diff --git a/.circleci/cimodel/data/caffe2_build_data.py b/.circleci/cimodel/data/caffe2_build_data.py index 446f61af8666..dcaa90e01f04 100644 --- a/.circleci/cimodel/data/caffe2_build_data.py +++ b/.circleci/cimodel/data/caffe2_build_data.py @@ -11,7 +11,6 @@ (Ver("gcc", "4.9"), [X("py2")]), ]), (Ver("ubuntu", "16.04"), [ - (Ver("cuda", "8.0"), [X("py2")]), (Ver("cuda", "9.0"), [ # TODO make explicit that this is a "secret TensorRT build" # (see https://github.com/pytorch/pytorch/pull/17323#discussion_r259446749) diff --git a/.circleci/cimodel/data/caffe2_build_definitions.py b/.circleci/cimodel/data/caffe2_build_definitions.py index aa67a49894a9..44ef7f4ac3d3 100644 --- a/.circleci/cimodel/data/caffe2_build_definitions.py +++ b/.circleci/cimodel/data/caffe2_build_definitions.py @@ -10,7 +10,7 @@ DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/" -DOCKER_IMAGE_VERSION = 273 +DOCKER_IMAGE_VERSION = 276 class Conf(object): diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index d18cbb8f17cc..7d102413daf5 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -8,7 +8,7 @@ (None, [ X("2.7.9"), X("2.7"), - X("3.5"), + ("3.5", [("important", [X(True)])]), X("nightly"), ]), ("gcc", [ @@ -28,7 +28,6 @@ ("5", [X("3.6")]), ]), ("cuda", [ - ("8", [X("3.6")]), ("9", [ # Note there are magic strings here # https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/build.sh#L21 @@ -37,7 +36,7 @@ # and # https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/build.sh#L153 # (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259453144) - X("2.7"), + ("2.7", [("important", [X(True)])]), X("3.6"), ]), ("9.2", [X("3.6")]), @@ -136,6 +135,7 @@ def child_constructor(self): next_nodes = { "xla": XlaConfigNode, "namedtensor": NamedTensorConfigNode, + "important": ImportantConfigNode, } return next_nodes[experimental_feature] @@ -156,6 +156,14 @@ def init2(self, node_name): self.props["is_namedtensor"] = node_name +class ImportantConfigNode(TreeConfigNode): + def modify_label(self, label): + return "IMPORTANT=" + str(label) + + def init2(self, node_name): + self.props["is_important"] = node_name + + class XenialCompilerConfigNode(TreeConfigNode): def init2(self, node_name): diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index 387410c562c7..f42d8db5c063 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -25,7 +25,8 @@ def __init__(self, gpu_resource=None, dependent_tests=None, parent_build=None, - is_namedtensor=False): + is_namedtensor=False, + is_important=False): self.distro = distro self.pyver = pyver @@ -37,6 +38,7 @@ def __init__(self, # (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259453608) self.is_xla = is_xla self.is_namedtensor = is_namedtensor + self.is_important = is_important self.restrict_phases = restrict_phases self.gpu_resource = gpu_resource @@ -46,7 +48,10 @@ def __init__(self, # TODO: Eliminate the special casing for docker paths # In the short term, we *will* need to support special casing as docker images are merged for caffe2 and pytorch def get_parms(self, for_docker): - leading = ["pytorch"] + leading = [] + if self.is_important and not for_docker: + leading.append("AAA") + leading.append("pytorch") if self.is_xla and not for_docker: leading.append("xla") if 
self.is_namedtensor and not for_docker: @@ -225,6 +230,7 @@ def instantiate_configs(): is_xla = fc.find_prop("is_xla") or False is_namedtensor = fc.find_prop("is_namedtensor") or False + is_important = fc.find_prop("is_important") or False gpu_resource = None if cuda_version and cuda_version != "10": @@ -239,9 +245,10 @@ def instantiate_configs(): restrict_phases, gpu_resource, is_namedtensor=is_namedtensor, + is_important=is_important, ) - if cuda_version == "8": + if cuda_version == "9" and python_version == "3.6": c.dependent_tests = gen_dependent_configs(c) config_list.append(c) diff --git a/.circleci/config.yml b/.circleci/config.yml index 69fe24f832fa..95155de67632 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,6 +139,11 @@ setup_ci_environment: &setup_ci_environment no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/setup_ci_environment.sh +# Installs expect and moreutils so that we can call `unbuffer` and `ts`. +# Also installs OpenMP +# !!!!NOTE!!!! this is copied into a binary_macos_brew_update job which is the +# same but does not install libomp. If you are changing this, consider if you +# need to change that step as well. macos_brew_update: &macos_brew_update name: Brew update and install moreutils, expect and libomp no_output_timeout: "1h" @@ -154,21 +159,6 @@ macos_brew_update: &macos_brew_update brew install expect brew install libomp -# In version 2.1 and above we could make this a command and pass a parameter to -# it, but in this version there is no way to pass a parameter to a step -binary_macos_brew_update: &binary_macos_brew_update - name: Brew update and install moreutils and expect - no_output_timeout: "1h" - command: | - set -ex - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards - brew update - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect ############################################################################## @@ -197,7 +187,7 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults docker pull ${DOCKER_IMAGE} >/dev/null export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - git submodule sync && git submodule update -q --init + git submodule sync && git submodule update -q --init --recursive docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace @@ -387,6 +377,7 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_test_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + ############################################################################## # Macos build defaults ############################################################################## @@ -498,13 +489,13 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults # do not need both the pytorch and builder repos, so this is a little wasteful # (smoke tests and upload jobs do not need the pytorch repo). 
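Editor's note: several of the steps above install `expect` and `moreutils` solely so the CI can call `unbuffer` and `ts`, and the Linux test defaults pipe their generated `./command.sh` through both. A minimal sketch of that logging pattern is below; it assumes both tools are installed and is an illustration of the idea, not a step taken from this config.

```bash
#!/bin/bash
# unbuffer (from expect) runs the command under a pseudo-terminal so its output stays
# line-buffered even though it feeds a pipe; ts (from moreutils) prefixes each line
# with a timestamp, which is what keeps the CI logs readable in real time.
set -eux -o pipefail

echo 'echo "building step 1"; sleep 1; echo "building step 2"' > ./command.sh
unbuffer bash ./command.sh 2>&1 | ts
```

Without `unbuffer`, the child process would switch to block buffering as soon as its stdout is a pipe, and `ts` would only see the output in large bursts rather than line by line.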
binary_checkout: &binary_checkout - name: Checkout + name: Checkout pytorch/builder repo command: ~/workspace/.circleci/scripts/binary_checkout.sh # Parses circleci arguments in a consistent way, essentially routing to the # correct pythonXgccXcudaXos build we want binary_populate_env: &binary_populate_env - name: Set up env + name: Set up binary env variables command: ~/workspace/.circleci/scripts/binary_populate_env.sh binary_install_miniconda: &binary_install_miniconda @@ -521,6 +512,25 @@ binary_run_in_docker: &binary_run_in_docker # This step only runs on circleci linux machine executors that themselves # need to start docker images command: ~/workspace/.circleci/scripts/binary_run_in_docker.sh + +# This is copied almost verbatim from the macos_brew_update job +# In version 2.1 and above we could make this a command and pass a parameter to +# it, but in this version there is no way to pass a parameter to a step +binary_macos_brew_update: &binary_macos_brew_update + name: Brew update and install moreutils and expect + no_output_timeout: "1h" + command: | + set -eux -o pipefail + # moreutils installs a `parallel` executable by default, which conflicts + # with the executable from the GNU `parallel`, so we must unlink GNU + # `parallel` first, and relink it afterwards + brew update + brew unlink parallel + brew install moreutils + brew link parallel --overwrite + brew install expect + + # binary linux build defaults ############################################################################## binary_linux_build: &binary_linux_build @@ -535,14 +545,14 @@ binary_linux_build: &binary_linux_build - run: name: Install unbuffer and ts command: | - set -ex + set -eux -o pipefail source /env retry yum -q -y install epel-release retry yum -q -y install expect moreutils - run: name: Upgrade gcc version (based on env var) command: | - set -ex + set -eux -o pipefail source /env if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then source "/builder/upgrade_gcc_abi.sh" @@ -550,6 +560,11 @@ binary_linux_build: &binary_linux_build # Env variables are not persisted into the next step echo "export PATH=$PATH" >> /env echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> /env + + # We need to set this variable manually because + # https://github.com/pytorch/pytorch/blob/master/torch/abi-check.cpp + # sets the ABI to 0 by default + echo "export _GLIBCXX_USE_CXX11_ABI=1" >> /env else echo "Not upgrading gcc version" fi @@ -573,10 +588,14 @@ binary_linux_test: &binary_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -607,6 +626,7 @@ binary_linux_upload: &binary_linux_upload no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/binary_linux_upload.sh + ############################################################################## # Macos binary build defaults # The root of everything is /Users/distiller/pytorch-ci-env/workspace @@ -630,7 +650,7 @@ binary_mac_build: &binary_mac_build name: Build no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh" cat "$script" source "$script" @@ -639,7 +659,7 @@ binary_mac_build: &binary_mac_build name: Test no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_test.sh" cat "$script" 
source "$script" @@ -671,6 +691,8 @@ binary_mac_upload: &binary_mac_upload script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_upload.sh" cat "$script" source "$script" + + # Nighlty build smoke tests defaults # These are the second-round smoke tests. These make sure that the binaries are # correct from a user perspective, testing that they exist from the cloud are @@ -682,10 +704,14 @@ smoke_linux_test: &smoke_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -695,8 +721,7 @@ smoke_linux_test: &smoke_linux_test set -ex cat >/home/circleci/project/ci_test_script.sh <&1 | ts + caffe2_py2_gcc4_8_ubuntu14_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-gcc4.8-ubuntu14.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_gcc4_8_ubuntu14_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-gcc4.8-ubuntu14.04-test" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:276" resource_class: large <<: *caffe2_linux_test_defaults caffe2_py2_gcc4_9_ubuntu14_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-gcc4.9-ubuntu14.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.9-ubuntu14.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.9-ubuntu14.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build: - environment: - BUILD_ENVIRONMENT: "caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:273" - <<: *caffe2_linux_build_defaults - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test: - environment: - BUILD_ENVIRONMENT: "caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test" - USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:273" - resource_class: gpu.medium - <<: *caffe2_linux_test_defaults - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults caffe2_py2_mkl_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-mkl-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_mkl_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-mkl-ubuntu16.04-test" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:276" resource_class: large <<: *caffe2_linux_test_defaults caffe2_onnx_py2_gcc5_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-onnx-py2-gcc5-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_onnx_py2_gcc5_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-onnx-py2-gcc5-ubuntu16.04-test" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:276" resource_class: large <<: *caffe2_linux_test_defaults caffe2_py2_clang3_8_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-clang3.8-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_clang3_9_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-clang3.9-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_clang7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-clang7-ubuntu16.04-build" - DOCKER_IMAGE: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_android_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-android-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_0_cudnn7_centos7_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-centos7-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:276" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_0_cudnn7_centos7_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-centos7-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults @@ -1352,6 +1361,7 @@ jobs: PYTHON_VERSION: "2" <<: *caffe2_macos_build_defaults + # update_s3_htmls job # These jobs create html files for every cpu/cu## folder in s3. The html # files just store the names of all the files in that folder (which are @@ -1363,6 +1373,8 @@ jobs: machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: @@ -1388,7 +1400,7 @@ jobs: echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env source /home/circleci/project/env - set -ex + set -eux -o pipefail retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } @@ -1416,15 +1428,12 @@ jobs: machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: <<: *binary_checkout - # N.B. This sources binary_populate_env so that it takes the Pytorch - # version listed there. The only variables it needs are the date and the - # version string. - - run: - <<: *binary_populate_env - run: <<: *binary_install_miniconda - run: @@ -1432,18 +1441,24 @@ jobs: no_output_timeout: "1h" command: | set +x - echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env + echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" > /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env + export DATE="$(date -u +%Y_%m_%d)" + retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) + } source /home/circleci/project/env - set -ex + set -eux -o pipefail + # This is hardcoded to match binary_install_miniconda.sh + export PATH="/home/circleci/project/miniconda/bin:$PATH" # Not any awscli will work. Most won't. 
This one will work - export PATH="$MINICONDA_ROOT/bin:$PATH" retry conda create -qyn aws36 python=3.6 source activate aws36 pip install awscli==1.16.46 "/home/circleci/project/builder/cron/upload_binary_sizes.sh" + ############################################################################## # Binary build specs individual job specifications ############################################################################## @@ -2059,6 +2074,7 @@ jobs: resource_class: gpu.medium <<: *binary_linux_test + # There is currently no testing for libtorch TODO # binary_linux_libtorch_2.7m_cpu_test: # environment: @@ -2843,13 +2859,13 @@ workflows: requires: - setup - pytorch_linux_trusty_py2_7_build - - pytorch_linux_trusty_py3_5_build: + - AAA_pytorch_linux_trusty_py3_5_build: requires: - setup - - pytorch_linux_trusty_py3_5_test: + - AAA_pytorch_linux_trusty_py3_5_test: requires: - setup - - pytorch_linux_trusty_py3_5_build + - AAA_pytorch_linux_trusty_py3_5_build - pytorch_linux_trusty_pynightly_build: requires: - setup @@ -2899,52 +2915,45 @@ workflows: requires: - setup - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_build: + - AAA_pytorch_linux_xenial_cuda9_cudnn7_py2_build: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_test: + - AAA_pytorch_linux_xenial_cuda9_cudnn7_py2_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test: + - AAA_pytorch_linux_xenial_cuda9_cudnn7_py2_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_build: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_multigpu_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_slow_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_NO_AVX2_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_nogpu_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_NO_AVX_NO_AVX2_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_short_perf_test_gpu: - requires: - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_doc_push: - requires: - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_build: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_slow_test: requires: - setup - - pytorch_linux_xenial_cuda9_cudnn7_py2_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_nogpu_test: requires: - setup - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_build: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_short_perf_test_gpu: requires: - - setup - - pytorch_linux_xenial_cuda9_cudnn7_py3_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_doc_push: requires: - - setup - pytorch_linux_xenial_cuda9_cudnn7_py3_build - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build: requires: @@ -2971,6 +2980,7 @@ workflows: - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build: requires: - setup + - 
caffe2_py2_gcc4_8_ubuntu14_04_build: requires: - setup @@ -2978,13 +2988,6 @@ workflows: requires: - setup - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build: - requires: - - setup - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test: - requires: - - setup - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build: requires: - setup @@ -3087,6 +3090,7 @@ workflows: # requires: # - setup # - binary_linux_conda_3.6_cu90_build + ############################################################################## # Daily smoke test trigger ############################################################################## @@ -3899,9 +3903,17 @@ workflows: only: - master jobs: + - setup - update_s3_htmls_for_nightlies: context: org-member + requires: + - setup - update_s3_htmls_for_nightlies_devtoolset7: context: org-member + requires: + - setup - upload_binary_sizes: context: org-member + requires: + - setup + diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh index f4d9fa7f7beb..c1fa7472590b 100755 --- a/.circleci/scripts/binary_checkout.sh +++ b/.circleci/scripts/binary_checkout.sh @@ -1,6 +1,5 @@ #!/bin/bash - -set -ex +set -eux -o pipefail # This step runs on multiple executors with different envfile locations if [[ "$(uname)" == Darwin ]]; then # macos executor (builds and tests) @@ -20,13 +19,13 @@ export BUILDER_ROOT="$workdir/builder" # Clone the Pytorch branch git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT" pushd "$PYTORCH_ROOT" -if [[ -n "$CIRCLE_PR_NUMBER" ]]; then +if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then # "smoke" binary build on PRs git fetch --force origin "pull/${CIRCLE_PR_NUMBER}/head:remotes/origin/pull/${CIRCLE_PR_NUMBER}" git reset --hard "$CIRCLE_SHA1" git checkout -q -B "$CIRCLE_BRANCH" git reset --hard "$CIRCLE_SHA1" -elif [[ -n "$CIRCLE_SHA1" ]]; then +elif [[ -n "${CIRCLE_SHA1:-}" ]]; then # Scheduled workflows & "smoke" binary build on master on PR merges git reset --hard "$CIRCLE_SHA1" git checkout -q -B master diff --git a/.circleci/scripts/binary_install_miniconda.sh b/.circleci/scripts/binary_install_miniconda.sh index 05f4f7a3bfde..ea419ff3030b 100755 --- a/.circleci/scripts/binary_install_miniconda.sh +++ b/.circleci/scripts/binary_install_miniconda.sh @@ -1,15 +1,32 @@ #!/bin/bash -set -ex +set -eux -o pipefail + # This step runs on multiple executors with different envfile locations if [[ "$(uname)" == Darwin ]]; then - source "/Users/distiller/project/env" + envfile="/Users/distiller/project/env" elif [[ -d "/home/circleci/project" ]]; then # machine executor (binary tests) - source "/home/circleci/project/env" + envfile="/home/circleci/project/env" else # docker executor (binary builds) - source "/env" + envfile="/env" +fi + +# TODO this is super hacky and ugly. Basically, the binary_update_html job does +# not have an env file, since it does not call binary_populate_env.sh, since it +# does not have a BUILD_ENVIRONMENT. So for this one case, which we detect by a +# lack of an env file, we manually export the environment variables that we +# need to install miniconda +if [[ ! 
-f "$envfile" ]]; then + MINICONDA_ROOT="/home/circleci/project/miniconda" + workdir="/home/circleci/project" + retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) + } + export -f retry +else + source "$envfile" fi conda_sh="$workdir/install_miniconda.sh" @@ -22,10 +39,6 @@ chmod +x "$conda_sh" "$conda_sh" -b -p "$MINICONDA_ROOT" rm -f "$conda_sh" -# TODO we can probably remove the next two lines -export PATH="$MINICONDA_ROOT/bin:$PATH" -source "$MINICONDA_ROOT/bin/activate" - # We can't actually add miniconda to the PATH in the envfile, because that # breaks 'unbuffer' in Mac jobs. This is probably because conda comes with # a tclsh, which then gets inserted before the tclsh needed in /usr/bin diff --git a/.circleci/scripts/binary_linux_build.sh b/.circleci/scripts/binary_linux_build.sh index 38507ea06ce0..9061b86d42e5 100755 --- a/.circleci/scripts/binary_linux_build.sh +++ b/.circleci/scripts/binary_linux_build.sh @@ -1,7 +1,7 @@ #!/bin/bash echo "RUNNING ON $(uname -a) WITH $(nproc) CPUS AND $(free -m)" -set -ex +set -eux -o pipefail source /env # Defaults here so they can be changed in one place diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index bf867316ad3b..663fcfba465c 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -3,7 +3,7 @@ source /home/circleci/project/env cat >/home/circleci/project/ci_test_script.sh < /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env + echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH:-}" >> /home/circleci/project/env + echo "declare -x PYTHON_VERSION=${PYTHON_VERSION:-}" >> /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env - if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then + if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then echo "declare -x TORCH_CUDA_ARCH_LIST=5.2" >> /home/circleci/project/env fi export SCCACHE_MAX_JOBS=`expr $(nproc) - 1` @@ -97,21 +97,21 @@ if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then # This IAM user allows write access to S3 bucket for sccache & bazels3cache set +x - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2}" >> /home/circleci/project/env + echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" >> /home/circleci/project/env set -x else # This IAM user allows write access to S3 bucket for sccache set +x - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> /home/circleci/project/env + echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" >> /home/circleci/project/env set -x fi fi # This IAM user only allows read-write access to ECR set +x -export 
AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4} -export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4} +export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-} +export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-} eval $(aws ecr get-login --region us-east-1 --no-include-email) set -x diff --git a/.circleci/scripts/setup_linux_system_environment.sh b/.circleci/scripts/setup_linux_system_environment.sh index 2782b103a1f5..e6bc004aef6a 100755 --- a/.circleci/scripts/setup_linux_system_environment.sh +++ b/.circleci/scripts/setup_linux_system_environment.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -ex +set -eux -o pipefail # Set up CircleCI GPG keys for apt, if needed curl -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add - diff --git a/.circleci/verbatim-sources/binary-build-tests.yml b/.circleci/verbatim-sources/binary-build-tests.yml index 540151acb6cc..a260cd880838 100644 --- a/.circleci/verbatim-sources/binary-build-tests.yml +++ b/.circleci/verbatim-sources/binary-build-tests.yml @@ -1,3 +1,4 @@ + # There is currently no testing for libtorch TODO # binary_linux_libtorch_2.7m_cpu_test: # environment: diff --git a/.circleci/verbatim-sources/binary_update_htmls.yml b/.circleci/verbatim-sources/binary_update_htmls.yml index 969b3615e027..0ac7d16d0e37 100644 --- a/.circleci/verbatim-sources/binary_update_htmls.yml +++ b/.circleci/verbatim-sources/binary_update_htmls.yml @@ -1,3 +1,4 @@ + # update_s3_htmls job # These jobs create html files for every cpu/cu## folder in s3. The html # files just store the names of all the files in that folder (which are @@ -9,6 +10,8 @@ machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: @@ -34,7 +37,7 @@ echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env source /home/circleci/project/env - set -ex + set -eux -o pipefail retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } @@ -62,15 +65,12 @@ machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: <<: *binary_checkout - # N.B. This sources binary_populate_env so that it takes the Pytorch - # version listed there. The only variables it needs are the date and the - # version string. - - run: - <<: *binary_populate_env - run: <<: *binary_install_miniconda - run: @@ -78,15 +78,21 @@ no_output_timeout: "1h" command: | set +x - echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env + echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" > /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env + export DATE="$(date -u +%Y_%m_%d)" + retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) + } source /home/circleci/project/env - set -ex + set -eux -o pipefail + # This is hardcoded to match binary_install_miniconda.sh + export PATH="/home/circleci/project/miniconda/bin:$PATH" # Not any awscli will work. Most won't. 
This one will work - export PATH="$MINICONDA_ROOT/bin:$PATH" retry conda create -qyn aws36 python=3.6 source activate aws36 pip install awscli==1.16.46 "/home/circleci/project/builder/cron/upload_binary_sizes.sh" + diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index e4a4f0feb50a..d1cd51b6e222 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -139,6 +139,11 @@ setup_ci_environment: &setup_ci_environment no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/setup_ci_environment.sh +# Installs expect and moreutils so that we can call `unbuffer` and `ts`. +# Also installs OpenMP +# !!!!NOTE!!!! this is copied into a binary_macos_brew_update job which is the +# same but does not install libomp. If you are changing this, consider if you +# need to change that step as well. macos_brew_update: &macos_brew_update name: Brew update and install moreutils, expect and libomp no_output_timeout: "1h" @@ -154,18 +159,3 @@ macos_brew_update: &macos_brew_update brew install expect brew install libomp -# In version 2.1 and above we could make this a command and pass a parameter to -# it, but in this version there is no way to pass a parameter to a step -binary_macos_brew_update: &binary_macos_brew_update - name: Brew update and install moreutils and expect - no_output_timeout: "1h" - command: | - set -ex - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards - brew update - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect diff --git a/.circleci/verbatim-sources/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs-custom.yml index 47012225b7cc..79da6b5dd402 100644 --- a/.circleci/verbatim-sources/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs-custom.yml @@ -1,7 +1,8 @@ + pytorch_short_perf_test_gpu: environment: BUILD_ENVIRONMENT: pytorch-short-perf-test-gpu - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:300" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:300" PYTHON_VERSION: "3.6" USE_CUDA_DOCKER_RUNTIME: "1" resource_class: gpu.medium @@ -38,7 +39,7 @@ pytorch_doc_push: environment: BUILD_ENVIRONMENT: pytorch-doc-push - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:300" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:300" resource_class: large machine: image: ubuntu-1604:201903-01 @@ -197,6 +198,7 @@ export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4} set -x - git submodule sync && git submodule update -q --init + git submodule sync && git submodule update -q --init --recursive chmod a+x .jenkins/pytorch/macos-build.sh unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts + diff --git a/.circleci/verbatim-sources/job-specs-setup.yml b/.circleci/verbatim-sources/job-specs-setup.yml index 2032a463457a..9b748a4aba5a 100644 --- a/.circleci/verbatim-sources/job-specs-setup.yml +++ b/.circleci/verbatim-sources/job-specs-setup.yml @@ -1,3 +1,4 @@ + setup: docker: - image: circleci/python:3.7.3 @@ -10,3 +11,4 @@ - persist_to_workspace: root: . 
paths: .circleci/scripts + diff --git a/.circleci/verbatim-sources/linux-binary-build-defaults.yml b/.circleci/verbatim-sources/linux-binary-build-defaults.yml index 4b87359f1291..c9a0216f0557 100644 --- a/.circleci/verbatim-sources/linux-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/linux-binary-build-defaults.yml @@ -1,3 +1,4 @@ + # binary linux build defaults ############################################################################## binary_linux_build: &binary_linux_build @@ -12,14 +13,14 @@ binary_linux_build: &binary_linux_build - run: name: Install unbuffer and ts command: | - set -ex + set -eux -o pipefail source /env retry yum -q -y install epel-release retry yum -q -y install expect moreutils - run: name: Upgrade gcc version (based on env var) command: | - set -ex + set -eux -o pipefail source /env if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then source "/builder/upgrade_gcc_abi.sh" @@ -27,6 +28,11 @@ binary_linux_build: &binary_linux_build # Env variables are not persisted into the next step echo "export PATH=$PATH" >> /env echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> /env + + # We need to set this variable manually because + # https://github.com/pytorch/pytorch/blob/master/torch/abi-check.cpp + # sets the ABI to 0 by default + echo "export _GLIBCXX_USE_CXX11_ABI=1" >> /env else echo "Not upgrading gcc version" fi @@ -50,10 +56,14 @@ binary_linux_test: &binary_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -83,3 +93,4 @@ binary_linux_upload: &binary_linux_upload name: Upload no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/binary_linux_upload.sh + diff --git a/.circleci/verbatim-sources/linux-build-defaults.yml b/.circleci/verbatim-sources/linux-build-defaults.yml index 605176da7819..106ad6f645e6 100644 --- a/.circleci/verbatim-sources/linux-build-defaults.yml +++ b/.circleci/verbatim-sources/linux-build-defaults.yml @@ -26,7 +26,7 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults docker pull ${DOCKER_IMAGE} >/dev/null export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - git submodule sync && git submodule update -q --init + git submodule sync && git submodule update -q --init --recursive docker cp /home/circleci/project/. 
$id:/var/lib/jenkins/workspace @@ -215,3 +215,4 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_test_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + diff --git a/.circleci/verbatim-sources/macos-binary-build-defaults.yml b/.circleci/verbatim-sources/macos-binary-build-defaults.yml index 2fc4ca37f1ec..c49f9bbe0289 100644 --- a/.circleci/verbatim-sources/macos-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/macos-binary-build-defaults.yml @@ -22,7 +22,7 @@ binary_mac_build: &binary_mac_build name: Build no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh" cat "$script" source "$script" @@ -31,7 +31,7 @@ binary_mac_build: &binary_mac_build name: Test no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_test.sh" cat "$script" source "$script" @@ -63,3 +63,4 @@ binary_mac_upload: &binary_mac_upload script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_upload.sh" cat "$script" source "$script" + diff --git a/.circleci/verbatim-sources/nightly-binary-build-defaults.yml b/.circleci/verbatim-sources/nightly-binary-build-defaults.yml index c95c74c2a43e..0d354a5e44f9 100644 --- a/.circleci/verbatim-sources/nightly-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/nightly-binary-build-defaults.yml @@ -25,13 +25,13 @@ # do not need both the pytorch and builder repos, so this is a little wasteful # (smoke tests and upload jobs do not need the pytorch repo). 
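Editor's note: a recurring change in the scripts above is replacing `set -ex` with `set -eux -o pipefail` and guarding optional CircleCI variables with `${VAR:-}`. The sketch below shows why the two go together and repeats the retry helper these jobs define inline; the final `curl` URL is a placeholder for illustration, not a command from this diff.

```bash
#!/bin/bash
# -e: exit on any failed command; -u: treat unset variables as errors;
# -x: trace commands; -o pipefail: a pipeline fails if any stage fails.
set -eux -o pipefail

# With `set -u`, expanding $CIRCLE_PR_NUMBER directly would abort the script when the
# variable is unset (e.g. on scheduled builds); ${VAR:-} substitutes an empty string.
if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then
  echo "building pull request ${CIRCLE_PR_NUMBER}"
fi

# Retry helper as defined inline by the upload/update jobs: re-run with sleeps of 1, 2, 4, 8s.
retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

retry curl -fsSL https://example.com/health   # placeholder command to show usage
```

The same reasoning is behind the `${USE_CUDA_DOCKER_RUNTIME:-}` check and the `:-` defaults added to the AWS credential variables elsewhere in this diff.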
binary_checkout: &binary_checkout - name: Checkout + name: Checkout pytorch/builder repo command: ~/workspace/.circleci/scripts/binary_checkout.sh # Parses circleci arguments in a consistent way, essentially routing to the # correct pythonXgccXcudaXos build we want binary_populate_env: &binary_populate_env - name: Set up env + name: Set up binary env variables command: ~/workspace/.circleci/scripts/binary_populate_env.sh binary_install_miniconda: &binary_install_miniconda @@ -48,3 +48,21 @@ binary_run_in_docker: &binary_run_in_docker # This step only runs on circleci linux machine executors that themselves # need to start docker images command: ~/workspace/.circleci/scripts/binary_run_in_docker.sh + +# This is copied almost verbatim from the macos_brew_update job +# In version 2.1 and above we could make this a command and pass a parameter to +# it, but in this version there is no way to pass a parameter to a step +binary_macos_brew_update: &binary_macos_brew_update + name: Brew update and install moreutils and expect + no_output_timeout: "1h" + command: | + set -eux -o pipefail + # moreutils installs a `parallel` executable by default, which conflicts + # with the executable from the GNU `parallel`, so we must unlink GNU + # `parallel` first, and relink it afterwards + brew update + brew unlink parallel + brew install moreutils + brew link parallel --overwrite + brew install expect + diff --git a/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml b/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml index 729555bbe7d6..b8745547e785 100644 --- a/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml +++ b/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml @@ -1,3 +1,4 @@ + # Nighlty build smoke tests defaults # These are the second-round smoke tests. 
These make sure that the binaries are # correct from a user perspective, testing that they exist from the cloud are @@ -9,10 +10,14 @@ smoke_linux_test: &smoke_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -22,8 +27,7 @@ smoke_linux_test: &smoke_linux_test set -ex cat >/home/circleci/project/ci_test_script.sh <=0.6.2" --user fi if [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX-* ]]; then diff --git a/.jenkins/pytorch/win-build.sh b/.jenkins/pytorch/win-build.sh index 3b6f330dd5b9..dbe2fd48995a 100755 --- a/.jenkins/pytorch/win-build.sh +++ b/.jenkins/pytorch/win-build.sh @@ -15,6 +15,7 @@ COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-build SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) source "$SCRIPT_PARENT_DIR/common.sh" +export IMAGE_COMMIT_ID=`git rev-parse HEAD` export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} if [[ ${JOB_NAME} == *"develop"* ]]; then export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index dc4bc2ab4f24..4208d462f647 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -21,13 +21,42 @@ if "%REBUILD%"=="" ( pip install -q ninja ) git submodule sync --recursive git submodule update --init --recursive -set PATH=%TMP_DIR_WIN%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\libnvvp;%PATH% +if "%CUDA_VERSION%" == "9" goto cuda_build_9 +if "%CUDA_VERSION%" == "10" goto cuda_build_10 +goto cuda_build_end + +:cuda_build_9 + +:: Override VS env here +pushd . 
+call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on +popd +set DISTUTILS_USE_SDK=1 + set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDA_PATH_V9_0=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set CUDA_PATH_V9_0=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_10 + +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 +set CUDA_PATH_V10_1=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_common + +set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% +set CUDNN_ROOT_DIR=%CUDA_PATH% set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt -set CUDNN_LIB_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64 -set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDNN_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% + +:cuda_build_end + +set PATH=%TMP_DIR_WIN%\bin;%PATH% :: Target only our CI GPU machine's CUDA arch to speed up the build set TORCH_CUDA_ARCH_LIST=5.2 diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat index 4e4c3b7ad337..631f5d1e6f64 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -1,9 +1,17 @@ +if "%CUDA_VERSION%" == "9" set CUDA_SUFFIX=cuda90 +if "%CUDA_VERSION%" == "10" set CUDA_SUFFIX=cuda101 + +if "%CUDA_SUFFIX%" == "" ( + echo unknown CUDA version, please set `CUDA_VERSION` to 9 or 10. + exit /b 1 +) + if "%REBUILD%"=="" ( if "%BUILD_ENVIRONMENT%"=="" ( - curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.0_cuda90_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.0_cuda90_%BUILD_TYPE%.7z + curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z ) else ( - aws s3 cp s3://ossci-windows/magma_2.5.0_cuda90_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.0_cuda90_%BUILD_TYPE%.7z --quiet + aws s3 cp s3://ossci-windows/magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet ) - 7z x -aoa %TMP_DIR_WIN%\magma_2.5.0_cuda90_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma + 7z x -aoa %TMP_DIR_WIN%\magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma ) set MAGMA_HOME=%TMP_DIR_WIN%\magma diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index 4431e7d10643..345a03aacf77 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -25,21 +25,48 @@ pip install -q ninja future hypothesis "librosa>=0.6.2" psutil :: No need to install faulthandler since we only test Python >= 3.6 on Windows :: faulthandler is builtin since Python 3.3 +if "%CUDA_VERSION%" == "9" goto cuda_build_9 +if "%CUDA_VERSION%" == "10" goto cuda_build_10 +goto cuda_build_end + +:cuda_build_9 + pushd . 
-call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 +call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on popd -set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\libnvvp;%PATH% set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDA_PATH_V9_0=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set CUDA_PATH_V9_0=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_10 + +pushd . +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on +popd + +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 +set CUDA_PATH_V10_1=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_common + +set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% +set CUDNN_ROOT_DIR=%CUDA_PATH% set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt -set CUDNN_LIB_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64 -set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDNN_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% +set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin +set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice +set NUMBAPRO_NVVM=%CUDA_PATH%\nvvm\bin\nvvm64_32_0.dll + +:cuda_build_end + set PYTHONPATH=%TMP_DIR_WIN%\build;%PYTHONPATH% -set NUMBAPRO_CUDALIB=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin -set NUMBAPRO_LIBDEVICE=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\libdevice -set NUMBAPRO_NVVM=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\bin\nvvm64_32_0.dll if NOT "%BUILD_ENVIRONMENT%"=="" ( pushd %TMP_DIR_WIN%\build @@ -51,4 +78,7 @@ if NOT "%BUILD_ENVIRONMENT%"=="" ( xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ ) +@echo off +echo @echo off >> %TMP_DIR%/ci_scripts/pytorch_env_restore.bat for /f "usebackq tokens=*" %%i in (`set`) do echo set "%%i" >> %TMP_DIR%/ci_scripts/pytorch_env_restore.bat +@echo on diff --git a/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat b/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat index 0c13e1ccc4fa..d86692dbabba 100644 --- a/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat +++ b/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat @@ -4,11 +4,22 @@ cd test\custom_operator :: Build the custom operator library. mkdir build -cd build +pushd build + +echo "Executing CMake for custom_operator test..." + :: Note: Caffe2 does not support MSVC + CUDA + Debug mode (has to be Release mode) cmake -DCMAKE_PREFIX_PATH=%TMP_DIR_WIN%\build\torch -DCMAKE_BUILD_TYPE=Release -GNinja .. +if ERRORLEVEL 1 exit /b 1 + +echo "Executing Ninja for custom_operator test..." + ninja -v -cd .. +if ERRORLEVEL 1 exit /b 1 + +echo "Ninja succeeded for custom_operator test." + +popd :: Run tests Python-side and export a script module. 
python test_custom_ops.py -v diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index c51066622684..425a4b8365aa 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -6,6 +6,7 @@ COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-test SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) source "$SCRIPT_PARENT_DIR/common.sh" +export IMAGE_COMMIT_ID=`git rev-parse HEAD` export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} if [[ ${JOB_NAME} == *"develop"* ]]; then export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} diff --git a/CITATION b/CITATION index 046a2fa42038..9597a50fa754 100644 --- a/CITATION +++ b/CITATION @@ -1,6 +1,6 @@ @inproceedings{paszke2017automatic, - title={Automatic differentiation in PyTorch}, + title={Automatic Differentiation in {PyTorch}}, author={Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam}, - booktitle={NIPS-W}, + booktitle={NIPS Autodiff Workshop}, year={2017} } diff --git a/CMakeLists.txt b/CMakeLists.txt index 70264b802390..172b2d3b7296 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,11 @@ set(CMAKE_CXX_STANDARD 11) if (NOT MSVC) set(CMAKE_C_STANDARD 11) endif() +if (DEFINED GLIBCXX_USE_CXX11_ABI) + if (${GLIBCXX_USE_CXX11_ABI} EQUAL 1) + set(CXX_STANDARD_REQUIRED ON) + endif() +endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -61,7 +66,6 @@ endif() # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. include(CMakeDependentOption) -option(BUILD_TORCH "Build Torch" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) option(BUILD_ATEN_ONLY "Build only a subset focused on ATen only" OFF) option(BUILD_BINARY "Build C++ binaries" OFF) @@ -318,6 +322,10 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constexpr-not-const") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") + endif() + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") endif() if ((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0"))) OR (CMAKE_COMPILER_IS_GNUCXX @@ -343,7 +351,9 @@ if(NOT MSVC) else() foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) if (${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") @@ -508,6 +518,7 @@ if (BUILD_SHARED_LIBS) ${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix DESTINATION share/cmake/Caffe2/ COMPONENT dev) + install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2 FILE Caffe2Targets.cmake COMPONENT dev) diff --git a/README.md b/README.md index 3b4ee37e4231..be886cda25b5 100644 --- a/README.md +++ b/README.md @@ -151,13 +151,13 @@ They requires JetPack 4.2 and above and are maintained by @dusty-nv ### From Source If you are installing from source, we highly recommend installing an 
[Anaconda](https://www.anaconda.com/distribution/#download-section) environment. -You will get a high-quality BLAS library (MKL) and you get a controlled compiler version regardless of your Linux distro. +You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. Once you have [Anaconda](https://www.anaconda.com/distribution/#download-section) installed, here are the instructions. If you want to compile with CUDA support, install -- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 7.5 or above -- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v6.x or above +- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 9 or above +- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above If you want to disable CUDA support, export environment variable `NO_CUDA=1`. Other potentially useful environment variables may be found in `setup.py`. @@ -175,7 +175,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing On Linux ```bash # Add LAPACK support for the GPU if needed -conda install -c pytorch magma-cuda90 # or [magma-cuda80 | magma-cuda92 | magma-cuda100 ] depending on your cuda version +conda install -c pytorch magma-cuda90 # or [magma-cuda92 | magma-cuda100 ] depending on your cuda version ``` #### Get the PyTorch Source @@ -183,7 +183,7 @@ conda install -c pytorch magma-cuda90 # or [magma-cuda80 | magma-cuda92 | magma- git clone --recursive https://github.com/pytorch/pytorch cd pytorch # if you are updating an existing checkout -git submodule sync +git submodule sync git submodule update --init --recursive ``` @@ -209,9 +209,6 @@ If the version of Visual Studio 2017 is higher than 15.4.5, installing of "VC++
There is no guarantee that the build will work correctly with VC++ 2017 toolsets other than version 15.4 v14.11.
"VC++ 2017 version 15.4 v14.11 toolset" might be installed onto already installed Visual Studio 2017 by running its installation once again and checking the corresponding checkbox under "Individual components"/"Compilers, build tools, and runtimes". -For building against CUDA 8.0 Visual Studio 2015 Update 3 (version 14.0), and the [patch](https://download.microsoft.com/download/8/1/d/81dbe6bb-ed92-411a-bef5-3a75ff972c6a/vc14-kb4020481.exe) are needed to be installed too. -The details of the patch can be found [here](https://support.microsoft.com/en-gb/help/4020481/fix-link-exe-crashes-with-a-fatal-lnk1000-error-when-you-use-wholearch). - NVTX is a part of CUDA distributive, where it is called "Nsight Compute". For installing it onto already installed CUDA run CUDA installation once again and check the corresponding checkbox. Be sure that CUDA with Nsight Compute is installed after Visual Studio 2017. @@ -221,9 +218,6 @@ REM [Optional] The following two lines are needed for Python 2.7, but the suppor set MSSdk=1 set FORCE_PY27_BUILD=1 -REM [Optional] As for CUDA 8, VS2015 Update 3 is required; use the following line. -set "CUDAHOSTCXX=%VS140COMNTOOLS%..\..\VC\bin\amd64\cl.exe" - set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 set DISTUTILS_USE_SDK=1 diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 213d6465d2d1..b3d0a4c32457 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -252,12 +252,7 @@ IF(USE_CUDA AND NOT USE_ROCM) # build fake CuFFT lib in build dir EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc) - if(${CUDA_VERSION_MAJOR} EQUAL "8") - SET(CUFFT_FAKELINK_OPTIONS - --generate-code arch=compute_35,code=sm_35 - --generate-code arch=compute_50,code=sm_50 - --generate-code arch=compute_60,code=sm_60) - elseif(${CUDA_VERSION_MAJOR} EQUAL "9") + if(${CUDA_VERSION_MAJOR} EQUAL "9") SET(CUFFT_FAKELINK_OPTIONS --generate-code arch=compute_35,code=sm_35 --generate-code arch=compute_50,code=sm_50 diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index b1bed17f4140..8842eddd1fa2 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -30,7 +30,7 @@ inline std::pair collapse_dims( T* strides, int64_t dims, const int excludeDim = -1) { - AT_CHECK( + TORCH_CHECK( excludeDim >= -1 && excludeDim < dims, "expected excluded dim between -1 and dims - 1"); @@ -331,69 +331,6 @@ apply_op(int64_t numel, int64_t offset, const Op& op, Args... iters) { } } - -inline void apply_kernel(){}; - -// TODO: Deal elegantly with 0-dim tensors. iters.strides_ of 0-dim -// strided_tensor_iter will be of size 0 for dim 0 and iters.strides_[iters.dim_ -// - 1] will index at -1. C++14 integer_sequence could be of use here. -template -inline void -apply_kernel(int64_t numel, int64_t offset, const Op& op, Args... 
iters) { - if (offset > 0) - forward(offset, iters...); - int64_t size = std::min(numel, max_iterate_size(iters...)); - op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); - iterate(size, iters...); - iterate_overflow(iters...); - int64_t i = size; - size = std::min(numel, max_iterate_size(iters...)); - for (; i < numel;) { - op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); - iterate(size, iters...); - i += size; - iterate_overflow(iters...); - } -} - -template -inline void -CPU_tensor_parallel_kernel_apply2(Tensor tensor1, Tensor tensor2, const Op op) { - if (!_apply_preamble({tensor1, tensor2})) - return; - if (tensor1.numel() == 1) { - op(1, tensor1.data(), tensor2.data(), 0, 0); - return; - } - if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { - parallel_for( - 0, - tensor1.numel(), - 1, - [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { - apply_kernel( - end - begin, - begin, - op, - strided_tensor_iter_fixed(tensor1), - strided_tensor_iter_fixed(tensor2)); - }); - } else { - parallel_for( - 0, - tensor1.numel(), - 1, - [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { - apply_kernel( - end - begin, - begin, - op, - strided_tensor_iter(tensor1), - strided_tensor_iter(tensor2)); - }); - } -} - /* Apply a pointwise operator to sequence of tensors diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 29a1d6709131..c3a64623e9e0 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -123,7 +123,7 @@ TypeExtendedInterface& getType(TensorOptions options) { TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( - backend, typeMetaToScalarType(impl->dtype()), impl->is_variable() && !at::NonVariableTypeMode::is_enabled()); + backend, typeMetaToScalarType(impl->dtype()), impl->is_variable()); } TypeExtendedInterface& getType(const Tensor& t) { diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 8604ec563047..e75db146ed20 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -42,6 +42,12 @@ static DLDataType getDLDataType(const Tensor& t) { case ScalarType::QInt8: throw std::logic_error("QInt8 is not supported by dlpack"); break; + case ScalarType::QUInt8: + throw std::logic_error("QUInt8 is not supported by dlpack"); + break; + case ScalarType::QInt32: + throw std::logic_error("QInt32 is not supported by dlpack"); + break; case ScalarType::ComplexHalf: throw std::logic_error("ComplexHalf is not supported by dlpack"); case ScalarType::ComplexFloat: diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 0eea8e39909b..75f831c12b87 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -92,11 +92,13 @@ options: - arguments: - arg: THTensor* self + broadcast: mask inplace fallback types:Bool - THBoolTensor* mask - real value - zero_dim_tensor_only: True arguments: - arg: THTensor* self + broadcast: mask inplace fallback types:Bool - THBoolTensor* mask - THTensor* value ]] @@ -118,12 +120,15 @@ return: self arguments: - arg: THTensor* self + broadcast: mask inplace fallback types:Bool - THBoolTensor* mask - THTensor* source ]] [[ name: _th_masked_select cname: maskedSelect + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -137,6 +142,8 @@ [[ name: _th_masked_select_bool cname: maskedSelectBool + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -144,11 
+151,13 @@ - arg: THTensor* result output: True - arg: THTensor* self + broadcast: mask fallback types:Bool - THBoolTensor* mask ]] [[ name: _th_nonzero cname: nonzero + cpu_half: True cpu_bool: True cuda_bool: True variants: @@ -365,6 +374,8 @@ ]] [[ name: _th_and + cpu_bool: True + cuda_bool: True cname: __and__ variants: - function @@ -387,6 +398,8 @@ [[ name: _th_iand_ cname: __iand__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -406,6 +419,8 @@ [[ name: _th_or cname: __or__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -427,6 +442,8 @@ [[ name: _th_ior_ cname: __ior__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -446,6 +463,8 @@ [[ name: _th_xor cname: __xor__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -1771,6 +1790,8 @@ [[ name: _th_sign cname: sign + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -2362,22 +2383,6 @@ if_false: N default: S ]] -[[ - name: _th_getri_single - cname: getri - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: function - return: argument 0 - arguments: - - arg: THTensor* output - output: True - - THTensor* self -]] [[ name: _th_potri cname: potri @@ -2624,7 +2629,6 @@ - floating_point backends: - CPU - - CUDA cname: uniform variants: function return: self diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 649bc9d4cec1..8deac1ac9aee 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -11,6 +12,14 @@ return __VA_ARGS__(); \ } +#define AT_QINT_PRIVATE_CASE_TYPE(enum_type, type, underlying_enum, underlying_type, ...) \ + case enum_type: { \ + const auto& UNDERLYING_TYPE C10_UNUSED = underlying_enum; \ + using scalar_t C10_UNUSED = type; \ + using underlying_t C10_UNUSED = underlying_type; \ + return __VA_ARGS__(); \ + } + namespace detail { template @@ -59,6 +68,54 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } +// The AT_DISPATCH_* family of macros provides the ability to +// conveniently generate specializations of a kernel over all of the +// dtypes we care about in PyTorch. We call it "dispatch" because +// we are "dispatching" to the correct, dtype-specific kernel. +// +// A standard usage looks like: +// +// AT_DISPATCH_ALL_TYPES(self.scalar_type(), "op_name", [&] { +// // Your code here, with 'scalar_t' now defined to +// // be the dtype in question +// }) +// +// There are many variations of this macro, so it's important to +// understand exactly /which/ dtypes you want to get instantiated, as +// well as what the "default" set is. +// +// The default set of dtypes that are instantiated (e.g., by +// AT_DISPATCH_ALL_TYPES) are floating point types (float, double), +// and integral types (int32_t, int64_t, int16_t, int8_t, uint8_t), +// but NOT booleans (bool), half-precision floats (Half) or +// complex number (std::complex, std::complex). +// This "cut" is somewhat historical (the default types are the +// ones that TH historically supported), but it also reflects the +// fact that the non-default types are "poorly" behaved (booleans +// are NOT integers mod 2, half precision operations ~essentially +// don't exist on CPU, complex numbers are an experimental application). +// +// Here are the questions you should generally ask to decide which +// dispatch you want: +// +// 1. Is this an integral or floating point specific operation? 
+// (If so, you'll want one of the FLOATING or INTEGRAL macros.) +// +// 2. Should half be supported? (If you're on CPU, the answer is almost +// definitely no. If you do want support, use one of the AND_HALF +// macros) +// +// Much rarer situations: +// +// 3. Should bool be supported? (You often have to write your kernel +// differently if arithmetic operations are involved.) If so, +// Use AT_DISPATCH_ALL_TYPES_AND along with ScalarType::Bool +// +// 4. Should complex be supported? The answer is almost always no, +// unless you are working on "generic" code that should work on +// all dtypes. + + // NB: the the_type variable is not used, but we have kept it for // backwards compatibility. It's probably not used by anyone though; // but we're just being safe (and it doesn't hurt.) Note we must @@ -127,26 +184,6 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() -#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ - [&] { \ - detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF(); \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ - default: \ - AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ - } \ - }() - #define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -180,6 +217,21 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& SCALAR_TYPE C10_UNUSED = TYPE; \ + switch (TYPE) { \ + AT_QINT_PRIVATE_CASE_TYPE( \ + kQInt8, qint8, kChar, int8_t, __VA_ARGS__) \ + AT_QINT_PRIVATE_CASE_TYPE( \ + kQUInt8, quint8, kByte, uint8_t, __VA_ARGS__) \ + AT_QINT_PRIVATE_CASE_TYPE( \ + kQInt32, qint32, kInt, int, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } \ + }() + #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -202,30 +254,6 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() -#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) 
\ - [&] { \ - detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ - default: \ - AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ - } \ - }() - #define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ [&] { \ switch (TYPE) { \ @@ -279,3 +307,51 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} AT_ERROR(#NAME, " not implemented for '", TYPE, "'"); \ } \ }() + +// ---------------------------------------------------------------------------- +// DEPRECATED MACROS, DON'T USE THESE +// ---------------------------------------------------------------------------- + +#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF(); \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) 
\ + [&] { \ + detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index f0e854cf4dbb..54fcc7721f71 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -16,7 +16,7 @@ std::vector infer_size(IntArrayRef a, IntArrayRef b) { int64_t sizeA = (dimA >= 0) ? a[dimA] : 1; int64_t sizeB = (dimB >= 0) ? b[dimB] : 1; - AT_CHECK( + TORCH_CHECK( sizeA == sizeB || sizeA == 1 || sizeB == 1, "The size of tensor a (", sizeA, ") must match the size of tensor b (", sizeB, @@ -53,7 +53,7 @@ std::tuple, std::vector> inferExpandGeometry( : expandedSizes[i + 1] * expandedStrides[i + 1]; int64_t targetSize = sizes[i]; if (targetSize == -1) { - AT_CHECK( + TORCH_CHECK( dim >= 0, "The expanded size of the tensor (", targetSize, @@ -62,7 +62,7 @@ std::tuple, std::vector> inferExpandGeometry( targetSize = size; } if (size != targetSize) { - AT_CHECK( + TORCH_CHECK( size == 1, "The expanded size of the tensor (", targetSize, diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index d001bbd17152..aa6ac328f6a3 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -36,7 +36,7 @@ static std::vector infer_size(IntArrayRef shape, int64_t numel) { // works yet // empty_tensor.view(-1, 0) // doesn't. - AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", + TORCH_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape, " because the unspecified dimension size -1 can be any " "value and is ambiguous"); res[*infer_dim] = numel / newsize; diff --git a/aten/src/ATen/MatrixRef.h b/aten/src/ATen/MatrixRef.h index 869142ff8562..d6bcc08addd2 100644 --- a/aten/src/ATen/MatrixRef.h +++ b/aten/src/ATen/MatrixRef.h @@ -40,7 +40,7 @@ namespace at { /// Construct an MatrixRef from an ArrayRef and outer stride. 
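Stepping back to the dispatch documentation added in `Dispatch.h` above: a minimal caller-side sketch of those macros is shown below. It is illustrative only and not part of this patch; the function name `add_one_` and the contiguity assumption are made up for the example.

```cpp
// Illustrative only (not from this patch): a simple CPU kernel dispatched over
// the default dtype set with AT_DISPATCH_ALL_TYPES. Assumes a contiguous
// tensor for simplicity; add_one_ is a hypothetical function name.
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

void add_one_(at::Tensor& self) {
  AT_DISPATCH_ALL_TYPES(self.scalar_type(), "add_one_", [&] {
    // Inside the lambda, scalar_t is bound to the C++ type that matches
    // self.scalar_type() (float, double, int64_t, ...).
    scalar_t* data = self.data<scalar_t>();
    for (int64_t i = 0; i < self.numel(); ++i) {
      data[i] = data[i] + scalar_t(1);
    }
  });
}
```

Per the new comment block, switching to `AT_DISPATCH_FLOATING_TYPES` or one of the `_AND` variants only changes which `ScalarType` cases get instantiated; the lambda body stays the same.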
/*implicit*/ MatrixRef(ArrayRef arr, size_type stride0) : arr(arr), stride0(stride0) { - AT_CHECK(arr.size() % stride0 == 0, "MatrixRef: ArrayRef size ", arr.size(), " not divisible by stride ", stride0) + TORCH_CHECK(arr.size() % stride0 == 0, "MatrixRef: ArrayRef size ", arr.size(), " not divisible by stride ", stride0) } /// @} @@ -59,7 +59,7 @@ namespace at { } else if (dim == 1) { return stride0; } else { - AT_CHECK(0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1"); + TORCH_CHECK(0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1"); } } diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index 12d8ea254b5c..92a206d002a1 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { @@ -36,7 +37,7 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { AT_ERROR("opaque tensors do not have strides"); } - bool is_contiguous() const override { + bool is_contiguous(c10::MemoryFormat memory_format=c10::MemoryFormat::Any) const override { AT_ERROR("opaque tensors do not have is_contiguous"); } @@ -78,15 +79,15 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields: // 1. the AutogradMeta pointer, because it is unique for each Variable. -// 2. the version counter, because although it lives in TensorImpl, the version counter is managed -// by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what -// the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details. +// 2. the version counter, because it is set to the passed in `version_counter`. +// See NOTE [ Version Counter Sharing ] for details. // -// NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites -// to this function that need to change the shallow copy's size or storage afterwards, and setting -// `allow_tensor_metadata_change_` to false would prevent those changes from happening and is -// undesirable. -c10::intrusive_ptr shallow_copy_and_detach() const override { +// NOTE: `allow_tensor_metadata_change` determines whether the TensorImpl shallow-copy +// allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). +// See NOTE [ Metadata Change for a Detached Tensor ] for details. +c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { //AT_ASSERT(false); auto impl = c10::make_intrusive>( type_id(), dtype(), device(), opaque_handle_, sizes_); @@ -99,6 +100,8 @@ c10::intrusive_ptr shallow_copy_and_detach() const override { impl->is_contiguous_ = is_contiguous_; impl->is_wrapped_number_ = is_wrapped_number_; impl->reserved_ = reserved_; + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); // OpaqueTensorImpl-specific fields (none currently). 
return impl; diff --git a/aten/src/ATen/Parallel.cpp b/aten/src/ATen/Parallel.cpp index 0a965f7dba06..0d81dd443699 100644 --- a/aten/src/ATen/Parallel.cpp +++ b/aten/src/ATen/Parallel.cpp @@ -5,6 +5,7 @@ #include #include +#include #ifdef TH_BLAS_MKL #include @@ -13,8 +14,43 @@ namespace at { namespace { +const int NOT_SET = -1; +const int CONSUMED = -2; + // Number of threads set by the user -std::atomic num_threads(-1); +std::atomic num_threads{NOT_SET}; + +// Number of inter-op threads set by the user; +// NOT_SET -> positive value -> CONSUMED +// (CONSUMED - thread pool is initialized) +// or +// NOT_SET -> CONSUMED +std::atomic num_interop_threads{NOT_SET}; + +// thread pool global instance is hidden, +// users should use at::launch and get/set_num_interop_threads interface +TaskThreadPoolBase& get_pool() { + static std::shared_ptr pool = + ThreadPoolRegistry()->Create( + "C10", + /* device_id */ 0, + /* pool_size */ num_interop_threads.exchange(CONSUMED), + /* create_new */ true); + return *pool; +} + + // Factory function for ThreadPoolRegistry +std::shared_ptr create_c10_threadpool( + int device_id, + int pool_size, + bool create_new) { + // For now, the only accepted device id is 0 + AT_CHECK(device_id == 0); + // Create new thread pool + AT_CHECK(create_new); + return std::make_shared(pool_size); +} + } void init_num_threads() { @@ -32,10 +68,9 @@ void init_num_threads() { } } -void set_num_threads(size_t nthreads) { - if (nthreads == 0) { - return; - } +void set_num_threads(int nthreads) { + AT_CHECK(nthreads > 0, "Expected positive number of threads"); + num_threads.store(nthreads); #ifdef _OPENMP omp_set_num_threads(nthreads); @@ -56,7 +91,7 @@ void set_num_threads(size_t nthreads) { // region might be different in the new thread; // Use init_num_threads() during thread initialization to ensure // consistent size of parallel region in different threads -size_t get_num_threads() { +int get_num_threads() { #ifdef _OPENMP return omp_get_max_threads(); #else @@ -100,7 +135,7 @@ std::string get_parallel_info() { } PTThreadPool::PTThreadPool( - std::size_t pool_size, + int pool_size, int numa_node_id) : c10::ThreadPool(pool_size, numa_node_id) {} @@ -109,26 +144,31 @@ void PTThreadPool::init_thread() { at::init_num_threads(); } -namespace { +C10_REGISTER_CREATOR(ThreadPoolRegistry, C10, create_c10_threadpool); -std::shared_ptr createC10ThreadPool( - int device_id, - int pool_size, - bool create_new) { - static std::shared_ptr pool = - std::make_shared(pool_size); - // For now, the only accepted device id is 0 - // for the JIT inter-op pool (CPU), - AT_ASSERT(device_id == 0); - // we use the shared thread pool - AT_ASSERT(!create_new); - // and the size does not change - AT_ASSERT(pool->size() == pool_size); - return pool; +void set_num_interop_threads(int nthreads) { + AT_CHECK(nthreads > 0, "Expected positive number of threads"); + + int no_value = NOT_SET; + AT_CHECK(num_interop_threads.compare_exchange_strong(no_value, nthreads), + "Error: cannot set number of interop threads after parallel work " + "has started or set_num_interop_threads called"); } -} // namespace +int get_num_interop_threads() { + int nthreads = num_interop_threads.load(); + if (nthreads > 0) { + return nthreads; + } else if (nthreads == NOT_SET) { + // return default value + return TaskThreadPoolBase::defaultNumThreads(); + } else { + return get_pool().size(); + } +} -C10_REGISTER_CREATOR(ThreadPoolRegistry, C10, createC10ThreadPool); +void launch(const std::function& func) { + get_pool().run(func); +} } // 
namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 2668619436c2..fe7530793589 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -30,10 +30,10 @@ inline int64_t divup(int64_t x, int64_t y) { CAFFE2_API void init_num_threads(); // Sets the number of threads to be used in parallel region -CAFFE2_API void set_num_threads(size_t); +CAFFE2_API void set_num_threads(int); // Returns the number of threads used in parallel region -CAFFE2_API size_t get_num_threads(); +CAFFE2_API int get_num_threads(); // Returns the current thread number (starting from 0) // in the current parallel region, or 0 in the sequential region @@ -153,10 +153,19 @@ CAFFE2_API std::string get_parallel_info(); class CAFFE2_API PTThreadPool : public c10::ThreadPool { public: explicit PTThreadPool( - std::size_t pool_size, + int pool_size, int numa_node_id = -1); void init_thread() override; }; +// Sets number of threads used for inter-op parallelism +CAFFE2_API void set_num_interop_threads(int); + +// Returns the number of threads used for inter-op parallelism +CAFFE2_API int get_num_interop_threads(); + +// Launches inter-op parallel task +CAFFE2_API void launch(const std::function& func); + } // namespace at diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index d87b29707fa7..5f61313b98fc 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -51,7 +51,7 @@ SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeM IntArrayRef SparseTensorImpl::strides() const { AT_ERROR("sparse tensors do not have strides"); } -bool SparseTensorImpl::is_contiguous() const { +bool SparseTensorImpl::is_contiguous(at::MemoryFormat memory_format) const { AT_ERROR("sparse tensors do not have is_contiguous"); } int64_t SparseTensorImpl::stride(int64_t d) const { @@ -74,7 +74,7 @@ int64_t SparseTensorImpl::dim() const { return sparse_dim_ + dense_dim_; } TensorImpl* SparseTensorImpl::maybe_zero_dim(bool condition_when_zero_dim) { - AT_CHECK(condition_when_zero_dim == (dim() == 0), + TORCH_CHECK(condition_when_zero_dim == (dim() == 0), "Attempted to maybe_zero_dim on a SparseTensorImpl to ", condition_when_zero_dim, " but the SparseTensor's dim() is ", dim(), " and SparseTensors do not support" " changing dimensionality via maybe_zero_dim"); @@ -90,29 +90,29 @@ int64_t SparseTensorImpl::storage_offset() const { AT_ERROR("sparse tensors do not have storage"); } void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, const Tensor& values) { - AT_CHECK(allow_tensor_metadata_change(), "set_indices_and_values_unsafe is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_indices_and_values_unsafe is not allowed on Tensor created from .data or .detach()"); AT_ASSERT(!indices.is_variable() && !values.is_variable()); // They should be plain tensors! 
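Before moving on: the `Parallel.h`/`Parallel.cpp` hunks above introduce a small public API for inter-op parallelism (`set_num_interop_threads`, `get_num_interop_threads`, `at::launch`). A minimal usage sketch, not part of the patch and with an arbitrary payload, might look like this:

```cpp
// Illustrative only: configuring and using the inter-op thread pool added in
// Parallel.h/Parallel.cpp above. The payload and function names are made up.
#include <ATen/Parallel.h>
#include <atomic>

std::atomic<int> tasks_done{0};

void init_interop() {
  // Per set_num_interop_threads(), this may be called at most once and only
  // before any inter-op work has started; the value is consumed when the
  // shared pool is first created.
  at::set_num_interop_threads(4);
}

void schedule_background_task() {
  at::launch([] {
    // Runs asynchronously on the shared inter-op thread pool.
    tasks_done.fetch_add(1);
  });
}
```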
// TODO: change this to check `.requires_grad()` and `GradMode::is_enabled()` when Variable and Tensor are merged - AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); - AT_CHECK(!values.is_sparse(), "expected values to be a dense tensor, but got values of layout ", values.layout()); + TORCH_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + TORCH_CHECK(!values.is_sparse(), "expected values to be a dense tensor, but got values of layout ", values.layout()); - AT_CHECK(values.device().type() == device().type(), "device type of values (", values.device().type(), ") must match device type of device().type()", device().type(), ")"); - AT_CHECK(values.scalar_type() == typeMetaToScalarType(dtype()), "dtype of values (", values.scalar_type(), ") must match dtype of sparse tensor (", typeMetaToScalarType(dtype()), ")"); - AT_CHECK(indices.scalar_type() == kLong, "indices must be an int64 tensor"); - AT_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); - AT_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); + TORCH_CHECK(values.device().type() == device().type(), "device type of values (", values.device().type(), ") must match device type of device().type()", device().type(), ")"); + TORCH_CHECK(values.scalar_type() == typeMetaToScalarType(dtype()), "dtype of values (", values.scalar_type(), ") must match dtype of sparse tensor (", typeMetaToScalarType(dtype()), ")"); + TORCH_CHECK(indices.scalar_type() == kLong, "indices must be an int64 tensor"); + TORCH_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); + TORCH_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); - AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()); - AT_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.size(1), ", nnz from values: ", values.size(0)); - AT_CHECK(indices.size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.size(0)); - AT_CHECK(values.dim() == dense_dim_ + 1, "values has incorrect number of dimensions, expected ", dense_dim_ + 1, ", got ", values.dim()); + TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()); + TORCH_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.size(1), ", nnz from values: ", values.size(0)); + TORCH_CHECK(indices.size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.size(0)); + TORCH_CHECK(values.dim() == dense_dim_ + 1, "values has incorrect number of dimensions, expected ", dense_dim_ + 1, ", got ", values.dim()); auto dense_size_original = sizes().slice(sparse_dim_); std::vector expected_values_size_vec = {values.size(0)}; expected_values_size_vec.insert(expected_values_size_vec.end(), dense_size_original.begin(), 
dense_size_original.end()); IntArrayRef expected_values_size(expected_values_size_vec); auto new_values_size = values.sizes(); - AT_CHECK( + TORCH_CHECK( std::equal(expected_values_size.begin(), expected_values_size.end(), new_values_size.begin()), "values has incorrect size, expected ", expected_values_size, ", got ", new_values_size ); diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 29cdda99eef2..e611b3b86ee0 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -41,7 +41,7 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { Tensor values() const { return values_; } IntArrayRef strides() const override; - bool is_contiguous() const override; + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const override; int64_t stride(int64_t d) const override; void resize_dim(int64_t ndim) override; void set_size(int64_t dim, int64_t new_size) override; @@ -57,7 +57,7 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim with // respect to indices and values void raw_resize_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { - AT_CHECK(allow_tensor_metadata_change(), "raw_resize_ is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "raw_resize_ is not allowed on Tensor created from .data or .detach()"); sizes_ = size.vec(); sparse_dim_ = sparse_dim; dense_dim_ = dense_dim; @@ -87,8 +87,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // 4. When we attempt to shrink the size of any of the sparse dimensions on a non-empty sparse tensor // (this could make some of the stored indices out-of-bound and thus unsafe). void resize_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { - AT_CHECK(allow_tensor_metadata_change(), "resize_ is not allowed on Tensor created from .data or .detach()"); - AT_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); + TORCH_CHECK(allow_tensor_metadata_change(), "resize_ is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); if (nnz() > 0) { auto alt_options_msg = "You could try the following options:\n\ 1. If you need an empty sparse tensor of this size, call `x = torch.sparse_coo_tensor(size)`.\n\ @@ -96,10 +96,10 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { 1. For both sparse and dense dimensions, keep the number of them constant and the size of them non-shrinking, and then try the same call again.\n\ 2. 
Or, create a new sparse tensor with the correct indices and values from this sparse tensor."; - AT_CHECK(sparse_dim == sparse_dim_, + TORCH_CHECK(sparse_dim == sparse_dim_, "changing the number of sparse dimensions (from ", sparse_dim_, " to ", sparse_dim, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); - AT_CHECK(dense_dim == dense_dim_, + TORCH_CHECK(dense_dim == dense_dim_, "changing the number of dense dimensions (from ", dense_dim_, " to ", dense_dim, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); bool shrinking_sparse_dims = false; @@ -121,10 +121,10 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { } } - AT_CHECK(!shrinking_sparse_dims, + TORCH_CHECK(!shrinking_sparse_dims, "shrinking the size of sparse dimensions (from ", sparse_size_original, " to ", sparse_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); - AT_CHECK(!shrinking_dense_dim, + TORCH_CHECK(!shrinking_dense_dim, "shrinking the size of dense dimensions (from ", dense_size_original, " to ", dense_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); } @@ -145,8 +145,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // NOTE: this function will resize the sparse tensor and also set `indices` and `values` to empty. void resize_and_clear_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { - AT_CHECK(allow_tensor_metadata_change(), "resize_and_clear_ is not allowed on Tensor created from .data or .detach()"); - AT_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); + TORCH_CHECK(allow_tensor_metadata_change(), "resize_and_clear_ is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); sizes_ = size.vec(); sparse_dim_ = sparse_dim; @@ -162,13 +162,13 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { } void set_coalesced(bool coalesced) { - AT_CHECK(allow_tensor_metadata_change(), "set_coalesced is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_coalesced is not allowed on Tensor created from .data or .detach()"); coalesced_ = coalesced; } // NOTE: this function is only used internally and not exposed to Python frontend void set_nnz_and_narrow(int64_t new_nnz) { - AT_CHECK(allow_tensor_metadata_change(), "set_nnz_and_narrow is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_nnz_and_narrow is not allowed on Tensor created from .data or .detach()"); AT_ASSERT(new_nnz <= nnz()); indices_ = indices_.narrow(1, 0, new_nnz); values_ = values_.narrow(0, 0, new_nnz); @@ -185,15 +185,15 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields: // 1. the AutogradMeta pointer, because it is unique for each Variable. - // 2. the version counter, because although it lives in TensorImpl, the version counter is managed - // by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what - // the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details. + // 2. 
the version counter, because it is set to the passed in `version_counter`. + // See NOTE [ Version Counter Sharing ] for details. // - // NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites - // to this function that need to change the shallow copy's size or storage afterwards, and setting - // `allow_tensor_metadata_change_` to false would prevent those changes from happening and is - // undesirable. - c10::intrusive_ptr shallow_copy_and_detach() const override { + // NOTE: `allow_tensor_metadata_change` determines whether the TensorImpl shallow-copy + // allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + // See NOTE [ Metadata Change for a Detached Tensor ] for details. + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { auto impl = c10::make_intrusive(type_id(), dtype()); // TensorImpl general fields // Note that these fields are not used in sparse tensor code, and we copy them here only for completeness. @@ -203,6 +203,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { impl->is_contiguous_ = is_contiguous_; impl->is_wrapped_number_ = is_wrapped_number_; impl->reserved_ = reserved_; + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); // Sparse-specific fields impl->sparse_dim_ = sparse_dim(); diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 3928494cae3b..1e4391961e1d 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -46,8 +46,8 @@ struct CAFFE2_API TensorGeometry { TensorGeometry transpose(int64_t dim0, int64_t dim1) { TensorGeometry r = *this; // copy - AT_CHECK(dim0 < dim(), "transpose: dim0=", dim0, " out of range (dim=", dim(), ")") - AT_CHECK(dim1 < dim(), "transpose: dim1=", dim1, " out of range (dim=", dim(), ")") + TORCH_CHECK(dim0 < dim(), "transpose: dim0=", dim0, " out of range (dim=", dim(), ")") + TORCH_CHECK(dim1 < dim(), "transpose: dim1=", dim1, " out of range (dim=", dim(), ")") std::swap(r.sizes_[dim0], r.sizes_[dim1]); std::swap(r.strides_[dim0], r.strides_[dim1]); return r; diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 745870423aa7..742089f2c7e2 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -20,13 +20,13 @@ std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) { } void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim) { - AT_CHECK(t->dim() == dim, + TORCH_CHECK(t->dim() == dim, "Expected ", dim, "-dimensional tensor, but got ", t->dim(), "-dimensional tensor for ", t," (while checking arguments for ", c, ")"); } void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end) { - AT_CHECK( + TORCH_CHECK( t->dim() >= dim_start && t->dim() < dim_end, "Expected ", dim_start, " to ", (dim_end - 1), " dimensions, but got ", t->dim(), "-dimensional tensor for ", t, " (while checking arguments for ", @@ -34,7 +34,7 @@ void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, } void checkContiguous(CheckedFrom c, const TensorGeometryArg& t) { - AT_CHECK( + TORCH_CHECK( t->is_contiguous(), "Expected contiguous tensor, but got non-contiguous tensor for ", t, " (while checking arguments for ", c, ")"); @@ -49,14 +49,14 @@ void checkAllContiguous(CheckedFrom c, at::ArrayRef ts) { void checkSize(CheckedFrom c, const 
TensorGeometryArg& t, IntArrayRef sizes) { checkDim(c, t, sizes.size()); - AT_CHECK( + TORCH_CHECK( t->sizes().equals(sizes), "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), " for ", t, " (while checking arguments for ", c, ")"); } void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size) { - AT_CHECK( + TORCH_CHECK( t->size(dim) == size, "Expected tensor to have size ", size, " at dimension ", dim, ", but got size ", t->size(dim), " for ", t, @@ -76,7 +76,7 @@ void checkAllSame(CheckedFrom c, ArrayRef tensors, void(*fn)(CheckedF } void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->sizes().equals(t2->sizes()), "Expected tensor for ", t1, " to have same size as tensor for ", t2, "; but ", t1->sizes(), " does not equal ", t2->sizes(), @@ -88,7 +88,7 @@ void checkAllSameSize(CheckedFrom c, ArrayRef tensors) { } void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel) { - AT_CHECK( + TORCH_CHECK( t->numel() == numel, "Expected tensor for ", t, " to have ", numel, " elements; but it actually has ", t->numel(), " elements", @@ -96,7 +96,7 @@ void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel) { } void checkSameNumel(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->numel() == t2->numel(), "Expected tensor for ", t1, " to have same number of elements as tensor for ", t2, "; but ", @@ -121,7 +121,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { << " to be on GPU (while checking arguments for " << c << ")"; AT_ERROR(oss.str()); } - AT_CHECK( + TORCH_CHECK( t1->get_device() == t2->get_device(), "Expected tensor for ", t1, " to have the same device as tensor for ", t2, "; but device ", t1->get_device(), " does not equal ", t2->get_device(), @@ -133,7 +133,7 @@ void checkAllSameGPU(CheckedFrom c, ArrayRef tensors) { } void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->type() == t2->type(), "Expected tensor for ", t1, " to have the same type as tensor for ", t2, "; but type ", t1->toString(), " does not equal ", t2->toString(), @@ -141,7 +141,7 @@ void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { } void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType ty) { - AT_CHECK( + TORCH_CHECK( t->scalar_type() == ty, "Expected tensor for ", t, " to have scalar type ", toString(ty), "; but got ", t->toString(), " instead (while checking arguments for ", c, @@ -173,7 +173,7 @@ void checkAllSameType(CheckedFrom c, ArrayRef tensors) { } void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->dim() == t2->dim(), "Expected tensor for ", t1, " to have the same dimension as tensor for ", t2, "; but ", t1->dim(), " does not equal ", t2->dim(), @@ -181,7 +181,7 @@ void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeomet } void checkDefined(CheckedFrom c, const TensorArg& t) { - AT_CHECK( + TORCH_CHECK( t->defined(), "Expected tensor for ", t, " to be non-null, but it was undefined ", " (while checking arguments for ", c, ")"); @@ -195,7 +195,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef ts) { } void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) { - AT_CHECK( + TORCH_CHECK( !t.defined() || t.type().backend() == backend, "Expected tensor to have ", toString(backend), " Backend, but got tensor 
with ", toString(t.type().backend()), " Backend ", @@ -209,7 +209,7 @@ void checkBackend(CheckedFrom c, at::ArrayRef tensors, at::Backend backe } void checkDeviceType(CheckedFrom c, const Tensor& t, DeviceType device_type) { - AT_CHECK( + TORCH_CHECK( !t.defined() || t.type().device_type() == device_type, "Expected tensor to have ", device_type, " DeviceType, but got tensor with ", t.type().device_type(), " DeviceType ", @@ -223,7 +223,7 @@ void checkDeviceType(CheckedFrom c, at::ArrayRef tensors, at::DeviceType } void checkLayout(CheckedFrom c, const Tensor& t, Layout layout) { - AT_CHECK( + TORCH_CHECK( !t.defined() || t.layout() == layout, "Expected tensor to have ", layout, " Layout, but got tensor with ", t.layout(), " Layout ", @@ -263,6 +263,29 @@ bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { return contig_if_nonempty; } +// Correspond to THCUNN_check_dim_size/THNN_check_dim_size +void check_dim_size( + const Tensor& tensor, + int64_t dim, + int64_t dim_size, + int64_t size) { + /* Check dimension size of a tensor */ + TORCH_CHECK( + tensor.dim() == dim && tensor.size(dim_size) == size, + "Expected a tensor of dimension ", + dim, + " and tensor.size[", + dim_size, + "] == ", + size, + " but got: dimension ", + tensor.dim(), + " and tensor.size[", + dim_size, + "] = ", + tensor.size(dim_size)); +} + namespace detail { std::vector defaultStrides(IntArrayRef sizes) { @@ -287,5 +310,6 @@ int64_t computeStorageSize(IntArrayRef sizes, IntArrayRef strides) { } return size; } + } // namespace detail } // namespace at diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 7ddd689376c6..3c8998c88f80 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -135,6 +135,13 @@ CAFFE2_API void* maybe_data_ptr(const TensorArg& tensor); // on whether a subgeometry is contiguous. 
CAFFE2_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); +// Correspond to THCUNN_check_dim_size/THNN_check_dim_size +CAFFE2_API void check_dim_size( + const Tensor& tensor, + int64_t dim, + int64_t dim_size, + int64_t size); + namespace detail { CAFFE2_API std::vector defaultStrides(IntArrayRef sizes); CAFFE2_API int64_t computeStorageSize(IntArrayRef sizes, IntArrayRef strides); diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index eefc0c80be77..a2af1b0dcd71 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -13,11 +13,11 @@ namespace at { constexpr size_t dim_bitset_size = 64; static inline std::bitset dim_list_to_bitset(IntArrayRef dims, int64_t ndims) { - AT_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); + TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); std::bitset seen; for (size_t i = 0; i < dims.size(); i++) { size_t dim = maybe_wrap_dim(dims[i], ndims); - AT_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); + TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); seen[dim] = true; } return seen; diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt index 0f5d86f2db1d..dd1f3f9058d5 100644 --- a/aten/src/ATen/core/CMakeLists.txt +++ b/aten/src/ATen/core/CMakeLists.txt @@ -9,7 +9,6 @@ EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) # Add files needed from jit folders LIST(APPEND ATen_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/source_range.h - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/source_location.h ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/function_schema_parser.h ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/lexer.h ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/strtod.h @@ -23,6 +22,7 @@ LIST(APPEND ATen_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/lexer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/strtod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/schema_type_parser.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/source_range.cpp ) # Pass to parent diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h index ed09ccb8af31..50645b62e5d4 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.h +++ b/aten/src/ATen/core/DeprecatedTypeProperties.h @@ -64,9 +64,16 @@ class CAFFE2_API DeprecatedTypeProperties { } std::string toString() const { - std::stringstream ss; - ss << at::toString(backend()) << at::toString(scalarType()) << "Type"; - return ss.str(); + std::string base_str; + if (backend_ == Backend::Undefined || scalar_type_ == ScalarType::Undefined) { + base_str = "UndefinedType"; + } else { + base_str = std::string(at::toString(backend_)) + at::toString(scalar_type_) + "Type"; + } + if (is_variable_) { + return "Variable[" + base_str + "]"; + } + return base_str; } DeprecatedTypeProperties & toBackend(Backend b) const { diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index 43e7a5736072..2cd374677108 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -2,46 +2,19 @@ #include #include -#include #include namespace c10 { +struct IValue; template class Dict; namespace impl { - -inline 
bool shallowEquals(const IValue& lhs, const IValue& rhs) { - if (lhs.isNone()) { - return rhs.isNone(); - } else if (lhs.isInt()) { - return rhs.isInt() && lhs.toInt() == rhs.toInt(); - } else if (lhs.isString()) { - return rhs.isString() && lhs.toStringRef() == rhs.toStringRef(); - } else if (lhs.isDouble()) { - return rhs.isDouble() && lhs.toDouble() == rhs.toDouble(); - } else if (lhs.isBool()) { - return rhs.isBool() && lhs.toBool() == rhs.toBool(); - } else { - AT_ERROR("shallowEquals(IValue, IValue) not implemented for type ", lhs.tagKind()); - } -} +bool shallowEquals(const IValue& lhs, const IValue& rhs); } namespace detail { struct DictHash { - size_t operator()(const IValue& ivalue) const { - if (ivalue.isInt()) { - return std::hash()(ivalue.toInt()); - } else if (ivalue.isString()) { - return std::hash()(ivalue.toStringRef()); - } else if (ivalue.isDouble()) { - return std::hash()(ivalue.toDouble()); - } else if (ivalue.isBool()) { - return std::hash()(ivalue.toBool()); - } else { - throw std::runtime_error("Can't hash IValues with this tag"); - } - } + size_t operator()(const IValue& ivalue) const; }; struct DictEqualTo { @@ -208,71 +181,53 @@ class Dict final { * Returns an iterator to the first element of the container. * If the container is empty, the returned iterator will be equal to end(). */ - iterator begin() { - return iterator{map_.begin()}; - } + iterator begin(); /** * Returns an iterator to the first element of the container. * If the container is empty, the returned iterator will be equal to end(). */ - const_iterator begin() const { - return const_iterator{map_.begin()}; - } + const_iterator begin() const; /** * Returns an iterator to the first element of the container. * If the container is empty, the returned iterator will be equal to end(). */ - const_iterator cbegin() const { - return const_iterator{map_.cbegin()}; - } + const_iterator cbegin() const; /** * Returns an iterator to the element following the last element of the container. * This element acts as a placeholder; attempting to access it results in undefined behavior. */ - iterator end() { - return iterator{map_.end()}; - } + iterator end(); /** * Returns an iterator to the element following the last element of the container. * This element acts as a placeholder; attempting to access it results in undefined behavior. */ - const_iterator end() const { - return const_iterator{map_.end()}; - } + const_iterator end() const; /** * Returns an iterator to the element following the last element of the container. * This element acts as a placeholder; attempting to access it results in undefined behavior. */ - const_iterator cend() const { - return const_iterator{map_.cend()}; - } + const_iterator cend() const; /** * Checks if the container has no elements. */ - bool empty() const { - return map_.empty(); - } + bool empty() const; /** * Returns the number of elements in the container. */ - size_type size() const { - return map_.size(); - } + size_type size() const; /** * Erases all elements from the container. After this call, size() returns zero. * Invalidates any references, pointers, or iterators referring to contained elements. May also invalidate past-the-end iterators. */ - void clear() { - map_.clear(); - } + void clear(); /** * Inserts element(s) into the container, if the container doesn't already contain an element with an equivalent key. 
@@ -281,14 +236,7 @@ class Dict final { * @return A pair consisting of an iterator to the inserted element (or to the element that prevented the insertion) and a bool denoting whether the insertion took place. */ template - std::pair insert(Key_&& key, Value_&& value) { - static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); - static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); - auto inserted = map_.insert({ - Key(std::forward(key)), - Value(std::forward(value))}); - return {iterator{inserted.first}, inserted.second}; - } + std::pair insert(Key_&& key, Value_&& value); /** * If an element with the given key already exists, it is overwritten with the given value. @@ -298,23 +246,14 @@ class Dict final { * @return The bool component is true if the insertion took place and false if the assignment took place. The iterator component is pointing at the element that was inserted or updated. */ template - std::pair insert_or_assign(Key_&& key, Value_&& value) { - static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert_or_assign"); - static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert_or_assign"); - auto inserted = map_.insert_or_assign( - Key(std::forward(key)), - Value(std::forward(value))); - return {iterator{inserted.first}, inserted.second}; - } + std::pair insert_or_assign(Key_&& key, Value_&& value); /** * Removes the element pointed to by iter. * May invalidate any references, pointers, or iterators referring to contained elements. * The iterator iter must be valid and dereferenceable. Thus the end() iterator (which is valid, but is not dereferenceable) cannot be used as a value for iter. */ - void erase(const_iterator iter) { - map_.erase(iter.entryRef_.iterator_); - } + void erase(const_iterator iter); /** * Removes the element with the given key, if it exists. @@ -322,17 +261,13 @@ class Dict final { * * @return The number of elements removed. This is either '1' if an element with the key existed, or '0' if it didn't. */ - C10_NODISCARD size_t erase(const Key& key) { - return map_.erase(key); - } + C10_NODISCARD size_t erase(const Key& key); /** * Returns the mapped value of the element with key equivalent to key. * If no such element exists, an exception of type std::out_of_range is thrown. */ - Value at(const Key& key) { - return map_.at(key).template to(); - } + Value at(const Key& key) const; /** * Finds an element with key equivalent to key. @@ -340,9 +275,7 @@ class Dict final { * @return Iterator to an element with key equivalent to key. * If no such element is found, past-the-end (see end()) iterator is returned. */ - iterator find(const Key& key) { - return iterator{map_.find(key)}; - } + iterator find(const Key& key); /** * Finds an element with key equivalent to key. @@ -350,26 +283,20 @@ class Dict final { * @return Iterator to an element with key equivalent to key. * If no such element is found, past-the-end (see end()) iterator is returned. */ - const_iterator find(const Key& key) const { - return const_iterator{map_.find(key)}; - } + const_iterator find(const Key& key) const; /** * Checks if there is an element with key equivalent to key in the container. * * @return true if there is such an element, otherwise false. 
*/ - bool contains(const Key& key) const { - return end() != find(key); - } + bool contains(const Key& key) const; /** * Increase the capacity so that at least count elements can be stored without * having to reallocate or rehash. */ - void reserve(size_type count) { - map_.reserve(count); - } + void reserve(size_type count); }; namespace impl { @@ -391,4 +318,4 @@ GenericDict toGenericDict(Dict&& dict) { } -#include +#include diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h new file mode 100644 index 000000000000..0118bee18c41 --- /dev/null +++ b/aten/src/ATen/core/Dict_inl.h @@ -0,0 +1,144 @@ +#pragma once + +#include + +namespace c10 { +namespace impl { +inline bool shallowEquals(const IValue& lhs, const IValue& rhs) { + if (lhs.isNone()) { + return rhs.isNone(); + } else if (lhs.isInt()) { + return rhs.isInt() && lhs.toInt() == rhs.toInt(); + } else if (lhs.isString()) { + return rhs.isString() && lhs.toStringRef() == rhs.toStringRef(); + } else if (lhs.isDouble()) { + return rhs.isDouble() && lhs.toDouble() == rhs.toDouble(); + } else if (lhs.isBool()) { + return rhs.isBool() && lhs.toBool() == rhs.toBool(); + } else { + AT_ERROR("shallowEquals(IValue, IValue) not implemented for type ", lhs.tagKind()); + } +} +} + +namespace detail { + +inline size_t DictHash::operator()(const IValue& ivalue) const { + if (ivalue.isInt()) { + return std::hash()(ivalue.toInt()); + } else if (ivalue.isString()) { + return std::hash()(ivalue.toStringRef()); + } else if (ivalue.isDouble()) { + return std::hash()(ivalue.toDouble()); + } else if (ivalue.isBool()) { + return std::hash()(ivalue.toBool()); + } else { + throw std::runtime_error("Can't hash IValues with this tag"); + } +} + +} + +template +typename Dict::iterator Dict::begin() { + return iterator{map_.begin()}; +} + +template +typename Dict::const_iterator Dict::begin() const { + return const_iterator{map_.begin()}; +} + +template +typename Dict::const_iterator Dict::cbegin() const { + return const_iterator{map_.cbegin()}; +} + +template +typename Dict::iterator Dict::end() { + return iterator{map_.end()}; +} + +template +typename Dict::const_iterator Dict::end() const { + return const_iterator{map_.end()}; +} + +template +typename Dict::const_iterator Dict::cend() const { + return const_iterator{map_.cend()}; +} + +template +bool Dict::empty() const { + return map_.empty(); +} + +template +typename Dict::size_type Dict::size() const { + return map_.size(); +} + +template +void Dict::clear() { + map_.clear(); +} + +template +template +std::pair::iterator, bool> Dict::insert(Key_&& key, Value_&& value) { + static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); + static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); + auto inserted = map_.insert(std::pair{ + Key(std::forward(key)), + Value(std::forward(value))}); + return {iterator{inserted.first}, inserted.second}; +} + +template +template +std::pair::iterator, bool> Dict::insert_or_assign(Key_&& key, Value_&& value) { + static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert_or_assign"); + static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert_or_assign"); + auto inserted = map_.insert_or_assign( + Key(std::forward(key)), + Value(std::forward(value))); + return {iterator{inserted.first}, inserted.second}; +} + +template +void Dict::erase(const_iterator iter) { + map_.erase(iter.entryRef_.iterator_); +} + 
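Since the Dict member definitions are only being moved into this new Dict_inl.h, not changed, a brief usage sketch may help keep the declared interface in mind. This is a hypothetical illustration, not code from the patch; the concrete key/value types are assumptions, and the helper is written as a template so it does not depend on a particular Dict instantiation.

// Hedged sketch: exercises the Dict API declared above (insert,
// insert_or_assign, find, contains, at, erase). The caller is assumed to
// pass something like a Dict<int64_t, std::string>.
#include <cassert>
#include <cstdint>
#include <string>

template <class DictT>
void exercise_dict(DictT& dict) {
  // insert() returns a pair<iterator, bool>; the bool is false if the key already existed.
  auto inserted = dict.insert(int64_t(1), std::string("one"));
  assert(inserted.second);

  // insert_or_assign() overwrites the mapped value for an existing key.
  dict.insert_or_assign(int64_t(1), std::string("uno"));

  assert(dict.contains(int64_t(1)));
  if (dict.find(int64_t(1)) != dict.end()) {
    // at() returns the mapped value or throws std::out_of_range.
    assert(dict.at(int64_t(1)) == "uno");
  }

  // erase(key) reports how many elements were removed (0 or 1).
  size_t removed = dict.erase(int64_t(1));
  assert(removed == 1);
  assert(dict.empty());
}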
+template +C10_NODISCARD size_t Dict::erase(const Key& key) { + return map_.erase(key); +} + +template +Value Dict::at(const Key& key) const { + return map_.at(key).template to(); +} + +template +typename Dict::iterator Dict::find(const Key& key) { + return iterator{map_.find(key)}; +} + +template +typename Dict::const_iterator Dict::find(const Key& key) const { + return const_iterator{map_.find(key)}; +} + +template +bool Dict::contains(const Key& key) const { + return end() != find(key); +} + +template +void Dict::reserve(size_type count) { + map_.reserve(count); +} + +} diff --git a/aten/src/ATen/core/LegacyTypeDispatch.cpp b/aten/src/ATen/core/LegacyTypeDispatch.cpp index d5b959a06508..f20062dbd34d 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.cpp +++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp @@ -2,54 +2,6 @@ namespace at { -/// NOTE [ Treating Variables as non-Variables in type dispatch ] -/// -/// Previously, in VariableType_*.cpp (generated by gen_variable_type.py), when -/// a function is using the 'use_derived' strategy, we call its implementation -/// on the base non-Variable type (`baseType`), passing unwrapped tensors to the -/// call so that any `.dispatch_type()` calls in the implementation can treat the passed -/// tensors as non-Variables and won't dispatch back to functions in VariableType. -/// -/// However, after the Variable/Tensor merge, there is no concept of unwrapping -/// a tensor anymore, and directly passing variables to the base type calls will -/// cause the `.dispatch_type()` dispatch in the implementation to treat the tensor as a -/// variable, and any function dispatch based on `.dispatch_type()` will dispatch back to -/// VariableType, which is not what we want. -/// -/// The solution to the above problem is to add `at::NonVariableTypeMode`, which -/// when enabled will cause `legacyTensorType()` and `getType()` to always return -/// non-Variable type, even if the tensor being called on is a variable. -/// -/// TODO: Since `torch::NoGradGuard` serves the same purpose in libtorch, we should -/// merge these two thread-local guards. - -/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, -/// thread_local is not supported. In that case, we don't provide -/// `at::NonVariableTypeMode`. -#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY - -thread_local bool NonVariableTypeMode_enabled = false; - -bool NonVariableTypeMode::is_enabled() { - return NonVariableTypeMode_enabled; -} - -void NonVariableTypeMode::set_enabled(bool enabled) { - NonVariableTypeMode_enabled = enabled; -} - -#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) - -bool NonVariableTypeMode::is_enabled() { - throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); -} - -void NonVariableTypeMode::set_enabled(bool enabled) { - throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); -} - -#endif - // TODO: This could be bad juju if someone calls globalContext() in the // destructor of an object with static lifetime. LegacyTypeDispatch & globalLegacyTypeDispatch() { diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 93b5348ec6a4..e65205124d10 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -139,11 +139,6 @@ class CAFFE2_API LegacyTypeDispatch { CAFFE2_API LegacyTypeDispatch& globalLegacyTypeDispatch(); -struct CAFFE2_API NonVariableTypeMode { - static bool is_enabled(); - static void set_enabled(bool enabled); -}; - // A RAII, thread local (!) 
guard that has the following effect: // // Upon construction: sets NonVariableTypeMode_enabled for the current thread to @@ -180,7 +175,7 @@ inline Type& legacyTensorType(const TensorImpl& tensor) { return *globalLegacyTypeDispatch().getTypeRaw( tensorTypeIdToBackend(tensor.type_id()), typeMetaToScalarType(tensor.dtype()), - tensor.is_variable() && !at::NonVariableTypeMode::is_enabled()); + tensor.is_variable()); } inline void initializeLegacyTypeDispatchFor(const TensorImpl& tensor) { @@ -188,7 +183,7 @@ inline void initializeLegacyTypeDispatchFor(const TensorImpl& tensor) { globalLegacyTypeDispatch().getType( tensorTypeIdToBackend(tensor.type_id()), typeMetaToScalarType(tensor.dtype()), - tensor.is_variable() && !at::NonVariableTypeMode::is_enabled()); + tensor.is_variable()); } } // namespace at diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index aa611e87454f..4597432cf171 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -35,14 +35,14 @@ void Tensor::enforce_invariants() { void Tensor::print() const { if (defined()) { - std::cerr << "[" << dispatch_type().toString() << " " << sizes() << "]" << std::endl; + std::cerr << "[" << type().toString() << " " << sizes() << "]" << std::endl; } else { std::cerr << "[UndefinedTensor]" << std::endl; } } -const char * Tensor::toString() const { - return dispatch_type().toString(); +std::string Tensor::toString() const { + return type().toString(); } } // namespace at diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index ba1daec99ab6..55e80aa630f3 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,7 @@ class CAFFE2_API Tensor { return impl_.weak_use_count(); } - const char * toString() const; + std::string toString() const; IntArrayRef sizes() const { return impl_->sizes(); @@ -165,8 +166,8 @@ class CAFFE2_API Tensor { int64_t ndimension() const { return dim(); } - bool is_contiguous() const { - return impl_->is_contiguous(); + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const { + return impl_->is_contiguous(memory_format); } // Total bytes consumed by the "view" of elements of the array. 
Does not @@ -193,7 +194,7 @@ class CAFFE2_API Tensor { return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( tensorTypeIdToBackend(type_id()), scalar_type(), - is_variable() && !at::NonVariableTypeMode::is_enabled()); + is_variable()); } Type & dispatch_type() const { return legacyTensorType(*impl_); @@ -266,7 +267,7 @@ class CAFFE2_API Tensor { template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return TensorAccessor(data(),sizes().data(),strides().data()); } template @@ -280,7 +281,7 @@ class CAFFE2_API Tensor { template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> PackedTensorAccessor packed_accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); } template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> @@ -373,7 +374,7 @@ class CAFFE2_API Tensor { Tensor & clamp_max_(Scalar max); Tensor clamp_min(Scalar min) const; Tensor & clamp_min_(Scalar min); - Tensor contiguous() const; + Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const; Tensor & copy_(const Tensor & src, bool non_blocking=false); Tensor cos() const; Tensor & cos_(); @@ -580,8 +581,9 @@ class CAFFE2_API Tensor { Tensor to_sparse(int64_t sparse_dim) const; Tensor to_sparse() const; Tensor to_mkldnn() const; - Tensor quantize_linear(double scale, int64_t zero_point) const; + Tensor quantize_linear(double scale, int64_t zero_point, ScalarType dtype) const; Tensor dequantize() const; + Tensor dequantize_linear(double scale, int64_t zero_point, ScalarType dtype) const; Scalar q_scale() const; Scalar q_zero_point() const; Tensor int_repr() const; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index 138842d532be..25ce96c9b749 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -176,8 +177,8 @@ inline Tensor Tensor::clamp_min(Scalar min) const { inline Tensor & Tensor::clamp_min_(Scalar min) { return dispatch_type().clamp_min_(*this, min); } -inline Tensor Tensor::contiguous() const { - return dispatch_type().contiguous(*this); +inline Tensor Tensor::contiguous(MemoryFormat memory_format) const { + return dispatch_type().contiguous(*this, memory_format); } inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { return dispatch_type().copy_(*this, src, non_blocking); @@ -797,12 +798,15 @@ inline Tensor Tensor::to_sparse() const { inline Tensor Tensor::to_mkldnn() const { return dispatch_type().to_mkldnn(*this); } -inline Tensor Tensor::quantize_linear(double scale, int64_t zero_point) const { - return dispatch_type().quantize_linear(*this, scale, zero_point); +inline Tensor Tensor::quantize_linear(double scale, int64_t zero_point, ScalarType dtype) const { + return dispatch_type().quantize_linear(*this, scale, zero_point, dtype); } inline Tensor Tensor::dequantize() const { return dispatch_type().dequantize(*this); } +inline Tensor Tensor::dequantize_linear(double scale, int64_t 
zero_point, ScalarType dtype) const { + return dispatch_type().dequantize_linear(*this, scale, zero_point, dtype); +} inline Scalar Tensor::q_scale() const { return dispatch_type().q_scale(*this); } @@ -1372,7 +1376,7 @@ inline bool is_quantized(Tensor self) { #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ - AT_CHECK( \ + TORCH_CHECK( \ scalar_type() == ScalarType::name, \ "expected scalar type ", \ #name, \ diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 68451b81ce08..ff1bb03e7e55 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -182,7 +183,7 @@ struct CAFFE2_API Type { virtual Tensor & clamp_max_(Tensor & self, Scalar max) const = 0; virtual Tensor clamp_min(const Tensor & self, Scalar min) const = 0; virtual Tensor & clamp_min_(Tensor & self, Scalar min) const = 0; - virtual Tensor contiguous(const Tensor & self) const = 0; + virtual Tensor contiguous(const Tensor & self, MemoryFormat memory_format) const = 0; virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor cos(const Tensor & self) const = 0; virtual Tensor & cos_(Tensor & self) const = 0; @@ -390,8 +391,9 @@ struct CAFFE2_API Type { virtual Tensor to_sparse(const Tensor & self, int64_t sparse_dim) const = 0; virtual Tensor to_sparse(const Tensor & self) const = 0; virtual Tensor to_mkldnn(const Tensor & self) const = 0; - virtual Tensor quantize_linear(const Tensor & self, double scale, int64_t zero_point) const = 0; + virtual Tensor quantize_linear(const Tensor & self, double scale, int64_t zero_point, ScalarType dtype) const = 0; virtual Tensor dequantize(const Tensor & self) const = 0; + virtual Tensor dequantize_linear(const Tensor & self, double scale, int64_t zero_point, ScalarType dtype) const = 0; virtual Scalar q_scale(const Tensor & self) const = 0; virtual Scalar q_zero_point(const Tensor & self) const = 0; virtual Tensor int_repr(const Tensor & self) const = 0; diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index 36704e424a8d..c9cb3d71f403 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -22,11 +22,6 @@ class AliasInfo { static const Symbol wc = Symbol::fromQualString("alias::*"); return wc; } - static AliasInfo createWildcard() { - AliasInfo ret; - ret.addBeforeSet(wildcardSet()); - return ret; - } void setIsWrite(bool isWrite) { isWrite_ = isWrite; @@ -57,10 +52,14 @@ class AliasInfo { return *beforeSets_.begin(); } - bool isWildcard() const { + bool isWildcardBefore() const { return beforeSets_.count(wildcardSet()) != 0; } + bool isWildcardAfter() const { + return afterSets_.count(wildcardSet()) != 0; + } + // the alias info for the contained types of the type // e.g. if this is an annotation on List[T], `sets` refers to // the alias sets that the list may be in diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index cec23de35dc7..ac3e71ebf5d8 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -8,7 +8,6 @@ // To explicitly use interned strings as symbols in your code, you must add // them to this list. 
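A small aside on the comment closing this hunk header: symbols from this list are normally consumed through the Symbol API rather than as raw strings. The following is a hedged, illustrative example only; it assumes c10::Symbol::fromQualString from ATen/core/interned_strings.h (also touched in this patch) and uses "aten::std_mean" purely because this diff adds that entry.

// Hedged sketch: resolving an interned string to a Symbol and back.
#include <ATen/core/interned_strings.h>
#include <iostream>

int main() {
  c10::Symbol s = c10::Symbol::fromQualString("aten::std_mean");
  std::cout << s.toQualString() << std::endl;  // prints "aten::std_mean"
  return 0;
}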
-#if !defined(C10_MOBILE) || defined(FEATURE_TORCH_MOBILE) #define FORALL_ATEN_BASE_SYMBOLS(_) \ _(aten, __and__) \ _(aten, __iand__) \ @@ -258,6 +257,8 @@ _(aten, cosh) \ _(aten, cosine_embedding_loss) \ _(aten, cosine_similarity) \ _(aten, cross) \ +_(aten, std_mean) \ +_(aten, var_mean) \ _(aten, ctc_loss) \ _(aten, cudnn_affine_grid_generator) \ _(aten, cudnn_affine_grid_generator_backward) \ @@ -906,6 +907,8 @@ _(attr, padding_value) \ _(attr, params) \ _(attr, pdist) \ _(attr, cdist) \ +_(attr, std_mean) \ +_(attr, var_mean) \ _(attr, periodic) \ _(attr, pivot) \ _(attr, pivots) \ @@ -1013,4 +1016,3 @@ _(attr, workspace) \ _(attr, x) \ _(attr, x1) \ _(attr, x2) -#endif diff --git a/aten/src/ATen/core/dispatch/DispatchTable.h b/aten/src/ATen/core/dispatch/DispatchTable.h index 9c4dafcbf116..ddf0564af187 100644 --- a/aten/src/ATen/core/dispatch/DispatchTable.h +++ b/aten/src/ATen/core/dispatch/DispatchTable.h @@ -61,7 +61,7 @@ class KernelTable_ final { if (!emplaced.second) { // Element already existed. Overwrite it. emplaced.first->second = value; - AT_WARN("Registered a kernel that overwrote a previoulsy registered kernel with same dispatch key '", + AT_WARN("Registered a kernel that overwrote a previously registered kernel with same dispatch key '", detail::dispatch_key_to_string(key), "' for operator '", operator_name ,"'."); } } @@ -205,7 +205,7 @@ class DispatchTable final { bool is_valid_; TensorTypeId get_dispatch_key(const Stack* stack) const { - auto first_tensor_arg = torch::jit::peek( + const IValue& first_tensor_arg = torch::jit::peek( *stack, 0, reverse_index_of_first_tensor_arg_ @@ -217,8 +217,7 @@ class DispatchTable final { } return tensor_list[0].type_id(); } else { - // TODO Avoid bumping the refcounter - return first_tensor_arg.toTensor().type_id(); + return first_tensor_arg.unsafeToTensorImpl()->type_id(); } } }; diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index d5e4c188589b..d1bfa03c4009 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -97,7 +97,7 @@ inline void FunctionSchema::checkAndNormalizeInputs( std::vector& inputs, const std::unordered_map& kwargs) const { // Do we have more inputs than the schema accepts? - AT_CHECK( + TORCH_CHECK( inputs.size() <= arguments().size(), "Expected at most ", arguments().size(), diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index f073f061259b..1f1d1f4e0e86 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -5,9 +5,12 @@ #include #include -#include #include +#if !defined(C10_MOBILE) || defined(FEATURE_TORCH_MOBILE) +#include +#endif + namespace c10 { #if !defined(C10_MOBILE) || defined(FEATURE_TORCH_MOBILE) @@ -70,6 +73,7 @@ namespace c10 { _(prim, requires_grad) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ + _(prim, Guard) \ _(prim, FusedConcat) \ _(prim, ConstantChunk) \ _(prim, MMTreeReduce) \ @@ -205,7 +209,7 @@ namespace c10 { // 'onnx' symbols correspond to ONNX operators. Their semantics // are defined in https://github.com/onnx/onnx/blob/master/docs/Operators.md // The particular version we are targeting is specified by '_onnx_opset_version' -// in torch.onnx.symbolic +// in torch.onnx.symbolic_helper // // In general, most ONNX operators won't get an entry here, because they // are handled from the Python end. 
However, you may occasionally need diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 7382f904a997..b41c8b311256 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -114,6 +114,16 @@ std::string ivalue::Object::name() const { return this->type_->qualname(); } +IValue ivalue::Object::getAttr(const std::string& name) const { + const size_t slot = type_->getAttributeSlot(name); + return getSlot(slot); +} + +void ivalue::Object::setAttr(const std::string& name, IValue v) { + const size_t slot = type_->getAttributeSlot(name); + setSlot(slot, std::move(v)); +} + void ivalue::Object::resizeObject(size_t slot) { AT_ASSERT(slot < type()->numAttributes()); slots_.resize(type()->numAttributes()); diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 9d6e46b3306a..dfc809323c89 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -1,623 +1,409 @@ #pragma once -#include -#include - -#include -#include -#include -#include -#include -#include +#include +#include +#include +namespace torch { +namespace jit { +namespace script { +struct Function; +} +} // namespace jit +} // namespace torch namespace c10 { +template class Dict; struct IValue; -struct ClassType; - -template -c10::intrusive_ptr IValue::moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - clearToNone(); - return t; -} -template -c10::intrusive_ptr IValue::toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - auto p = r; - r.release(); - return p; -} - -inline c10::intrusive_ptr IValue::toTuple() && { - AT_ASSERT(isTuple()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toTuple() const & { - AT_ASSERT(isTuple()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toFuture() && { - AT_ASSERT(isFuture()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toFuture() const & { - AT_ASSERT(isFuture()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toIntList() && { - AT_ASSERT(isIntList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toIntList() const & { - AT_ASSERT(isIntList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toString() && { - AT_ASSERT(isString()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toString() const & { - AT_ASSERT(isString()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toDoubleList() && { - AT_ASSERT(isDoubleList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toDoubleList() const & { - AT_ASSERT(isDoubleList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toBoolList() && { - AT_ASSERT(isBoolList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toBoolList() const & { - AT_ASSERT(isBoolList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toTensorList() && { - AT_ASSERT(isTensorList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toTensorList() const & { - AT_ASSERT(isTensorList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericList() && { - AT_ASSERT(isGenericList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericList() const & { - AT_ASSERT(isGenericList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericDict() && { - AT_ASSERT(isGenericDict()); - return 
moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericDict() const & { - AT_ASSERT(isGenericDict()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toObject() && { - AT_ASSERT(isObject()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toObject() const & { - AT_ASSERT(isObject()); - return toIntrusivePtr(); -} -inline at::Tensor IValue::toTensor() && { - AT_ASSERT(isTensor()); - return at::Tensor(moveToIntrusivePtr()); -} -inline at::Tensor IValue::toTensor() const & { - AT_ASSERT(isTensor()); - return at::Tensor(toIntrusivePtr()); -} -inline c10::intrusive_ptr IValue::toBlob() && { - AT_ASSERT(isBlob()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toBlob() const & { - AT_ASSERT(isBlob()); - return toIntrusivePtr();; -} - namespace ivalue { - -template -using Shared = c10::intrusive_ptr; - -// string -struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { - private: - const std::string str_; - public: - ConstantString(std::string str) - : str_(std::move(str)) {} - static c10::intrusive_ptr create(std::string str_); - const std::string & string() const { - return str_; - } - operator const std::string & () const { - return string(); - } - CAFFE2_API friend std::ostream& operator<<( - std::ostream& out, - const ConstantString& v); -}; - -template -struct CAFFE2_API List : c10::intrusive_ptr_target { - private: - std::vector elements_; - - public: - typedef Elem ElemType; - - List(std::vector elements_) : elements_(std::move(elements_)) {} - static c10::intrusive_ptr> create(std::vector elements_) { - return c10::make_intrusive>(std::move(elements_)); - } - const std::vector& elements() const & { - return elements_; - } - operator const std::vector&() const { - return elements(); - } - - std::vector& elements() & { - return elements_; - } - operator std::vector&() { - return elements(); - } - - std::vector&& elements() && { - return std::move(elements_); - } -}; - +struct Tuple; +template struct List; +using IntList = List; +using TensorList = List; +using DoubleList = List; +using BoolList = List; +using GenericList = List; struct Future; +struct ConstantString; struct GenericDict; - -struct CAFFE2_API Tuple : public List { - using List::List; - static c10::intrusive_ptr create(std::vector elements_) { - return c10::make_intrusive(std::move(elements_)); - } -}; - struct Object; } -// Future -struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { - private: - c10::intrusive_ptr intrusive_from_this() { - c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer - // from a raw `this` pointer - // so we need to bump the refcount - // to account for this ownership - return c10::intrusive_ptr::reclaim(this); +// IValue is the generic tagged union used by the interpreter to hold +// all value types. +// It is a 16-byte object with an 8-byte payload and an 8-byte tag. +// The tag is currently 4 bytes to determine the type, and 1 byte +// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs +// retain/release calls. 
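To make the layout described in the comment above concrete, here is a minimal stand-alone sketch of the same idea: an 8-byte payload union, a tag, and a flag recording whether the payload is an intrusive pointer that would need retain/release. This is not the real IValue (its definition follows immediately below in the diff); all names here are illustrative assumptions.

// Hedged sketch: a stripped-down tagged union in the spirit of IValue.
#include <cstdint>
#include <cstdio>

struct ToyValue {
  enum class Tag : uint32_t { None, Int, Double };

  union Payload {
    int64_t as_int;
    double as_double;
  } payload;
  Tag tag = Tag::None;
  bool is_ptr = false;  // the real class increfs/decrefs when this is set

  ToyValue() { payload.as_int = 0; }
  explicit ToyValue(int64_t i) : tag(Tag::Int) { payload.as_int = i; }
  explicit ToyValue(double d) : tag(Tag::Double) { payload.as_double = d; }

  bool isInt() const { return tag == Tag::Int; }
  bool isDouble() const { return tag == Tag::Double; }
};

int main() {
  ToyValue v(int64_t(7));
  if (v.isInt()) {
    std::printf("int payload: %lld\n", static_cast<long long>(v.payload.as_int));
  }
  // On common 64-bit ABIs this is 16 bytes: 8-byte payload + 4-byte tag +
  // 1-byte flag + padding, matching the size quoted in the comment above.
  std::printf("sizeof(ToyValue) = %zu\n", sizeof(ToyValue));
  return 0;
}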
+ +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Bool) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(BoolList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) \ + _(GenericDict) \ + _(Future) \ + _(Device) \ + _(Object) + +struct CAFFE2_API IValue final { + IValue() + : payload{0} + , tag(Tag::None) + , is_intrusive_ptr(false) {} + IValue(const IValue& rhs) + : payload(rhs.payload), + tag(rhs.tag), + is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + } } - - public: - struct CAFFE2_API FutureError final : public std::exception { - FutureError(std::string&& error_msg_) - : error_msg(std::move(error_msg_)) {} - - FutureError() = default; - - const char* what() const noexcept override { - return error_msg.c_str(); + IValue(IValue&& rhs) noexcept : IValue() { + swap(rhs); + } + ~IValue() { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); } + } + IValue & operator=(IValue && rhs) & noexcept { + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + return *this; + } + IValue & operator=(IValue const & rhs) & { + IValue(rhs).swap(*this); + return *this; + } - std::string error_msg; - }; + void dump() const; - /** - * Wait on the future until it completes. - */ - void wait() { - if (completed()) { - return; + bool isAliasOf(const IValue& rhs) const { + if (this->tag != rhs.tag) { + // Trivially don't alias if the type is different + return false; } - std::condition_variable finished; - bool fired = false; - - // Add a callback to notify the current thread - // when the current future completes. - addCallback([&] { - std::unique_lock lock(mutex_); - finished.notify_all(); - fired = true; - }); - - // The current thread will be blocked unless the above callback is fired. - std::unique_lock lock(mutex_); - while (!fired) { - finished.wait(lock); + + if (!this->is_intrusive_ptr) { + // Primitive types don't alias anything + return false; } - AT_ASSERT(completed()); - } + AT_ASSERT(rhs.is_intrusive_ptr); - /** - * Explicitly mark the future as completed with the output value. 
- */ - void markCompleted(IValue value) { - { - // This is not to protect completed_ but to create a barrier - // from possible addCallback() calls - std::unique_lock lock(mutex_); - AT_ASSERT(!completed()); - completed_ = true; - value_ = std::move(value); + // Tensors should be compared based on internal storage + if (this->isTensor()) { + const auto thisTensor = this->toTensor(); + const auto rhsTensor = rhs.toTensor(); + return thisTensor.is_alias_of(rhsTensor); } - fireCallbacks(); + // Other types can be compared by their ptr value + return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + } + void swap(IValue & rhs) noexcept { + std::swap(payload, rhs.payload); + std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); + std::swap(tag, rhs.tag); } - void markCompleted(FutureError&& error_) { - { - // This is not to protect completed_ but to create a barrier - // from possible addCallback() calls - std::unique_lock lock(mutex_); - AT_ASSERT(!completed()); - completed_ = true; - has_error = true; - error = std::move(error_); - } - - fireCallbacks(); + // Accessors for subtypes are arranged together below + // While some of these accessors could be generated through templates, + // we prefer to write them manually for clarity + + // Tensor + IValue(at::Tensor t) + : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { + // Note: the undefined tensor is not refcounted, so while it + // is tagged as a tensor, is_intrusive_ptr is set to false. + // This is not an optional optimization: our incref call + // *will not* do the right thing when called on an + // undefined tensor. + payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + } + bool isTensor() const { return Tag::Tensor == tag; } + at::Tensor toTensor() &&; + at::Tensor toTensor() const &; + at::TensorImpl* unsafeToTensorImpl() const { + return static_cast(payload.as_intrusive_ptr); } - // Get the result of the current future. - IValue value() { - std::unique_lock lock(mutex_); - AT_ASSERT(completed()); - if (has_error) { - throw error; - } - return value_; - } - - /** - * Add a callback to the future. - * The callbacks will be executed once the future completes. - * If the future has already completed, - * this function will execute the callback immediately. - */ - void addCallback(std::function callback) { - std::unique_lock lock(mutex_); - if (completed()) { - lock.unlock(); - callback(); - return; - } - callbacks.push_back(callback); + const IValue& toIValue() const { + return *this; + } + IValue& toIValue() { + return *this; } - // Check if the current future has completed - bool completed() { - return completed_; + IValue(intrusive_ptr blob) + : tag(Tag::Blob), is_intrusive_ptr(true) { + // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract + // and store it as a Tensor instead. 
+ payload.as_intrusive_ptr = blob.release(); + } + bool isBlob() const { + return Tag::Blob == tag; + } + c10::intrusive_ptr toBlob() &&; + c10::intrusive_ptr toBlob() const &; + + // Tuple + IValue(c10::intrusive_ptr v); + bool isTuple() const { return Tag::Tuple == tag; } + c10::intrusive_ptr toTuple() &&; + c10::intrusive_ptr toTuple() const &; + + // Double + IValue(double d) + : tag(Tag::Double), is_intrusive_ptr(false) { + payload.as_double = d; + } + bool isDouble() const { return Tag::Double == tag; } + double toDouble() const { + AT_ASSERT(isDouble()); + return payload.as_double; } - CAFFE2_API friend std::ostream& operator<<( - std::ostream& out, - const Future& v); + // Future + IValue(c10::intrusive_ptr v); + bool isFuture() const { return Tag::Future == tag; } + c10::intrusive_ptr toFuture() &&; + c10::intrusive_ptr toFuture() const &; - private: - void fireCallbacks() { - AT_ASSERT(completed()); - // There is no need to protect callbacks with the lock. - // Once completed_ is set to true, no one can add new callback to the list. - for (auto& callback : callbacks) { - callback(); - } - callbacks.clear(); + // Int + IValue(int64_t i) + : tag(Tag::Int), is_intrusive_ptr(false) { + payload.as_int = i; } - std::mutex mutex_; - IValue value_; // when finished the value - std::atomic_bool completed_ = {false}; // is this future complete - std::vector> callbacks; - bool has_error = false; - FutureError error; -}; + // allow you to pass literals (3, 4) without ambiguity + IValue(int32_t i) + : IValue(static_cast(i)) {} -// User-defined object. -struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { - public: - Object(std::shared_ptr type, size_t numSlots) : type_(std::move(type)) { - slots_.resize(numSlots); - } + bool isInt() const { return Tag::Int == tag; } - static c10::intrusive_ptr create( - std::shared_ptr type, - size_t numSlots) { - return c10::make_intrusive(std::move(type), numSlots); + int64_t toInt() const { + AT_ASSERT(isInt()); + return payload.as_int; } - void setSlot(size_t slot, IValue v) { - if (slot >= slots_.size()) { - // for module types, it is possible that the members of the class have - // expanded after the object was created. 
In this case, we expand - // the slots to the right size - resizeObject(slot); - } - slots_[slot] = v; + // Bool + IValue(bool b) + : tag(Tag::Bool), is_intrusive_ptr(false) { + payload.as_bool = b; } - - const IValue& getSlot(size_t slot) const { - return slots_.at(slot); + bool isBool() const { return Tag::Bool == tag; } + bool toBool() const { + AT_ASSERT(isBool()); + return payload.as_bool; } - std::string name() const; - - const std::vector& slots() const { - return slots_; + // IntList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + IValue(at::ArrayRef v) + : IValue(v.vec()) {} + bool isIntList() const { return Tag::IntList == tag; } + c10::intrusive_ptr toIntList() &&; + c10::intrusive_ptr toIntList() const &; + + const std::vector& toIntListRef() const; + const std::vector& toDoubleListRef() const; + const std::vector& toBoolListRef() const; + const std::vector& toTensorListRef() const; + const std::vector& toGenericListRef() const; + const c10::Dict& toGenericDictRef() const; + const std::string& toStringRef() const; + + // ConstantString + IValue(c10::intrusive_ptr v); + IValue(std::string v); + IValue(const char* v): IValue(std::string(v)) {} + bool isString() const { return Tag::String == tag; } + c10::intrusive_ptr toString() &&; + c10::intrusive_ptr toString() const &; + + // DoubleList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isDoubleList() const { return Tag::DoubleList == tag; } + c10::intrusive_ptr toDoubleList() &&; + c10::intrusive_ptr toDoubleList() const &; + + // BoolList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isBoolList() const { return Tag::BoolList == tag; } + c10::intrusive_ptr toBoolList() &&; + c10::intrusive_ptr toBoolList() const &; + + //TensorList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + c10::intrusive_ptr toTensorList() &&; + c10::intrusive_ptr toTensorList() const &; + + //GenericList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isGenericList() const { return Tag::GenericList == tag; } + c10::intrusive_ptr toGenericList() &&; + c10::intrusive_ptr toGenericList() const &; + + // GenericDict + IValue(c10::intrusive_ptr v); + IValue(c10::Dict v); + bool isGenericDict() const { return Tag::GenericDict == tag; } + c10::intrusive_ptr toGenericDict() &&; + c10::intrusive_ptr toGenericDict() const &; + + // ClassType + IValue(c10::intrusive_ptr v); + bool isObject() const { return tag == Tag::Object; } + c10::intrusive_ptr toObject() &&; + c10::intrusive_ptr toObject() const & ; + + // None + bool isNone() const { + return Tag::None == tag; } - std::shared_ptr type() const { - return type_; + std::string toNone() const { + AT_ASSERT(isNone()); + return "None"; } - - private: - void resizeObject(size_t slot); - std::shared_ptr type_; - std::vector slots_; -}; - -struct C10_EXPORT ivalue::GenericDict : c10::intrusive_ptr_target { - private: - c10::impl::GenericDict elements_; - - public: - GenericDict(c10::impl::GenericDict elements_) - : elements_(std::move(elements_)) {} - static c10::intrusive_ptr create( - c10::impl::GenericDict elements_) { - return c10::make_intrusive(std::move(elements_)); + // Scalar, which gets encoded as either an Int or a Double + IValue(at::Scalar s) + : IValue() { + if(s.isFloatingPoint()) { + *this = s.toDouble(); + } else { + *this = s.toLong(); + } } - const c10::impl::GenericDict& elements() const & { - return elements_; + bool isScalar() const { + return isDouble() || 
isInt(); } - c10::impl::GenericDict& elements() & { - return elements_; + at::Scalar toScalar() const { + if(isDouble()) + return toDouble(); + else if(isInt()) + return toInt(); + throw std::runtime_error("IValue is not a Scalar"); } - using IterationOrder = std::vector>; - const IterationOrder iterationOrder() const; -}; - -#undef TORCH_FORALL_TAGS - -namespace detail { - -struct _guarded_unsigned_long_unique_dummy final { - _guarded_unsigned_long_unique_dummy(int64_t){}; -}; -using _guarded_unsigned_long = c10::guts::conditional_t< - std::is_same::value || - std::is_same::value, - _guarded_unsigned_long_unique_dummy, - unsigned long>; - -} // namespace detail - -#define DEFINE_TO(type, method_name) \ -template<> \ -inline type IValue::to() && { \ - return std::move(*this).method_name(); \ -} \ -template<> \ -inline type IValue::to() const & { \ - return this->method_name(); \ -} -DEFINE_TO(at::Tensor, toTensor) -DEFINE_TO(c10::intrusive_ptr, toTuple) -DEFINE_TO(float, toDouble) -DEFINE_TO(double, toDouble) -DEFINE_TO(unsigned char, toInt) -DEFINE_TO(signed char, toInt) -DEFINE_TO(unsigned short, toInt) -DEFINE_TO(short, toInt) -DEFINE_TO(int, toInt) -DEFINE_TO(uint32_t, toInt) -DEFINE_TO(uint64_t, toInt) -DEFINE_TO(detail::_guarded_unsigned_long, toInt) -DEFINE_TO(int64_t, toInt) -DEFINE_TO(bool, toBool) -DEFINE_TO(c10::intrusive_ptr, toBlob); -DEFINE_TO(c10::intrusive_ptr, toDoubleList) -DEFINE_TO(c10::intrusive_ptr, toIntList) -DEFINE_TO(c10::intrusive_ptr, toBoolList) -DEFINE_TO(c10::intrusive_ptr, toTensorList) -DEFINE_TO(c10::intrusive_ptr, toGenericList) -DEFINE_TO(c10::intrusive_ptr, toGenericDict) -DEFINE_TO(c10::intrusive_ptr, toString) -DEFINE_TO(c10::intrusive_ptr, toObject) -DEFINE_TO(at::Scalar, toScalar) -DEFINE_TO(std::vector, toIntListRef) -DEFINE_TO(std::vector, toDoubleListRef) -DEFINE_TO(std::vector, toBoolListRef) -DEFINE_TO(std::vector, toTensorListRef) -DEFINE_TO(std::vector, toGenericListRef) -DEFINE_TO(std::string, toStringRef) -DEFINE_TO(c10::intrusive_ptr, toFuture) -DEFINE_TO(IValue, toIValue) -DEFINE_TO(c10::Device, toDevice) -DEFINE_TO(at::ScalarType, toScalarType) -DEFINE_TO(at::Layout, toLayout) - -template -struct _fake_type {}; - -template -std::vector generic_to( - const IValue* ivalue, - _fake_type>) { - return fmap(ivalue->toGenericListRef(), [](IValue item_ivalue) { return item_ivalue.to(); }); -} - -template -std::unordered_map generic_to( - const IValue* ivalue, - _fake_type>) { - std::unordered_map specialized_dict; - - for (auto item : ivalue->toGenericDictRef()) { - specialized_dict[item.key().to()] = item.value().to(); + // Device + IValue(c10::Device d) + : tag(Tag::Device), is_intrusive_ptr(false) { + payload.as_device.type = d.type(); + payload.as_device.index = d.index(); + } + bool isDevice() const { return Tag::Device == tag; } + c10::Device toDevice() const { + AT_ASSERT(isDevice()); + return c10::Device(payload.as_device.type, payload.as_device.index); } - return specialized_dict; -} - -template -inline T IValue::to() && { - return generic_to(this, _fake_type{}); -} - -template -inline T IValue::to() const& { - return generic_to(this, _fake_type{}); -} - -// note: when adding a DEFINE_TO case here you should also add a -// toX method to IValue. These named methods are much more discoverable -// than the to templated function. 
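The removed note above refers back to the DEFINE_TO list: the templated IValue::to<T>() is just a thin wrapper that forwards to the corresponding named accessor. A hedged usage sketch follows; it assumes ATen/core/ivalue.h is on the include path, and the values are arbitrary.

// Hedged sketch: to<T>() and the named accessors are interchangeable; the
// named form is preferred for readability, while the templated form is what
// generic helpers (e.g. stack pop/push utilities) rely on.
#include <ATen/core/ivalue.h>
#include <cassert>
#include <cstdint>

void demo() {
  c10::IValue iv(int64_t(42));
  assert(iv.isInt());
  int64_t a = iv.toInt();        // named accessor
  int64_t b = iv.to<int64_t>();  // generic accessor via DEFINE_TO(int64_t, toInt)
  assert(a == b);
}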
- -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::IntList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::IntList::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::string v) -: IValue(ivalue::ConstantString::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::DoubleList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::DoubleList::create(std::move(v))) {} + // ScalarType + at::ScalarType toScalarType() const { + return static_cast(toInt()); + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::BoolList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::BoolList::create(std::move(v))) {} + // Layout + at::Layout toLayout() const { + return static_cast(toInt()); + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::TensorList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::TensorList::create(std::move(v))) {} + // MemoryFormat + at::MemoryFormat toMemoryFormat() const { + return static_cast(toInt()); + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::GenericList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::GenericList::create(std::move(v))) {} -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::GenericDict), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(c10::impl::GenericDict v) -: IValue(ivalue::GenericDict::create(std::move(v))) {} + // for debugging + std::string tagKind() const { + switch(tag) { + #define DEFINE_CASE(x) case Tag::x: return #x; + TORCH_FORALL_TAGS(DEFINE_CASE) + #undef DEFINE_CASE + } + return "Invalid Tag"; + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Object), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Future), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} + // generic v.to() implementations + // that can be used in special functions like pop/push + // that use template meta-programming. + // prefer the directly named methods when you can, + // since they are simpler to understand -inline const std::vector& IValue::toIntListRef() const { - return toIntList()->elements(); -} + // Note: if you get linker errors saying one of these is missing, + // change it to ... && = delete; and you will see better error messages for why + // However, we cannot commit this because some compiler versions barf on it. 
+ template + T to() &&; + template + T to() const &; -inline const std::vector& IValue::toDoubleListRef() const { - return toDoubleList()->elements(); -} + // ToOptional: convert a IValue to the Optional obj that accepts both T and None + template + optional toOptional(); -inline const std::vector& IValue::toTensorListRef() const { - return toTensorList()->elements(); -} + // this is a shallow comparison of two IValues to test the object identity + bool isSameIdentity(const IValue& rhs) const; -inline const std::vector& IValue::toBoolListRef() const { - return toBoolList()->elements(); -} + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const IValue& v); -inline const std::vector& IValue::toGenericListRef() const { - return toGenericList()->elements(); -} + bool isPtrType() const { + return is_intrusive_ptr; + } -inline const c10::impl::GenericDict& IValue:: - toGenericDictRef() const { - return toGenericDict()->elements(); -} + private: + // NOTE: IValue tags are intentionally private. In the future we may encode + // this value different (e.g. using NaN boxing), and this would make it more + // costly to determine the tag for all types vs just determining if something + // is a particular type. Instead we want clients to use the `isX` methods when + // possible. If for perf. reasons you really, absolutely, must have a jump + // table, then we can revisit this. + enum class Tag : uint32_t { +#define DEFINE_TAG(x) x, + TORCH_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + }; -inline const std::string& IValue::toStringRef() const { - return toString()->string(); -} + template> + c10::intrusive_ptr moveToIntrusivePtr(); + template> + c10::intrusive_ptr toIntrusivePtr() const; -template -inline optional IValue::toOptional() { - if (this->isNone()) { - return nullopt; + void clearToNone() { + payload.as_int = 0; + tag = Tag::None; + is_intrusive_ptr = false; } - return this->to(); -} + union { + int64_t as_int; + double as_double; + bool as_bool; + c10::intrusive_ptr_target* as_intrusive_ptr; + struct { + DeviceType type; + DeviceIndex index; + } as_device; + } payload; + Tag tag; + bool is_intrusive_ptr; +}; -inline bool IValue::isSameIdentity(const IValue& rhs) const { - // We choose to not use memcmp for payload check due to potential random padding characters on union type - - // Semantics: - // 1. None is None, False is False, and True is True are all true - // 2. If it is a tensor type, we need to take undefined tensor into account - // 3. Undefined_tensor is None and vice versa should be true - // 4. If it is a reference type (i.e. is_intrusive_ptr), then is is True when the pointed-to object is the same. - // 5. False for all other comparisons. 
- if (this->isNone() && rhs.isNone()) { - return true; - } else if (this->isBool() && rhs.isBool()) { - // for bool type, do equality check - return this->toBool() == rhs.toBool(); - } else if (this->isTensor() && rhs.isTensor()) { - // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr is false for undefined tensor - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; - } else if (this->isTensor() && rhs.isNone()) { - // special case: undefined tensor and None are the same identity - return !this->is_intrusive_ptr; - } else if (this->isNone() && rhs.isTensor()) { - // special case: undefined tensor and None are the same identity - return !rhs.is_intrusive_ptr; - } else { - // for objects holding in IValue, do shallow compare on pointer address to testify the identity - return this->is_intrusive_ptr && rhs.is_intrusive_ptr - && this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; - } } -} // namespace c10 +#include diff --git a/aten/src/ATen/core/ivalue_base.h b/aten/src/ATen/core/ivalue_base.h deleted file mode 100644 index b1bd0a4473ef..000000000000 --- a/aten/src/ATen/core/ivalue_base.h +++ /dev/null @@ -1,391 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace c10 { -template class Dict; -struct IValue; -namespace ivalue { -struct Tuple; -template struct List; -using IntList = List; -using TensorList = List; -using DoubleList = List; -using BoolList = List; -using GenericList = List; -struct Future; -struct ConstantString; -struct GenericDict; -struct Object; -} - -// IValue is the generic tagged union used by the interpreter to hold -// all value types. -// It is a 16-byte object with an 8-byte payload and an 8-byte tag. -// The tag is currently 4 bytes to determine the type, and 1 byte -// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs -// retain/release calls. 
- -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) \ - _(Double) \ - _(Int) \ - _(Bool) \ - _(Tuple) \ - _(IntList) \ - _(DoubleList) \ - _(BoolList) \ - _(String) \ - _(TensorList) \ - _(Blob) \ - _(GenericList) \ - _(GenericDict) \ - _(Future) \ - _(Device) \ - _(Object) - -struct CAFFE2_API IValue final { - IValue() - : payload{0} - , tag(Tag::None) - , is_intrusive_ptr(false) {} - IValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), - is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); - } - } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); - } - ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } - } - IValue & operator=(IValue && rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None - return *this; - } - IValue & operator=(IValue const & rhs) & { - IValue(rhs).swap(*this); - return *this; - } - - void dump() const; - - bool isAliasOf(const IValue& rhs) const { - if (this->tag != rhs.tag) { - // Trivially don't alias if the type is different - return false; - } - - if (!this->is_intrusive_ptr) { - // Primitive types don't alias anything - return false; - } - - AT_ASSERT(rhs.is_intrusive_ptr); - - // Tensors should be compared based on internal storage - if (this->isTensor()) { - const auto thisTensor = this->toTensor(); - const auto rhsTensor = rhs.toTensor(); - return thisTensor.is_alias_of(rhsTensor); - } - - // Other types can be compared by their ptr value - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; - } - void swap(IValue & rhs) noexcept { - std::swap(payload, rhs.payload); - std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); - std::swap(tag, rhs.tag); - } - - // Accessors for subtypes are arranged together below - // While some of these accessors could be generated through templates, - // we prefer to write them manually for clarity - - // Tensor - IValue(at::Tensor t) - : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. - payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); - } - bool isTensor() const { return Tag::Tensor == tag; } - at::Tensor toTensor() &&; - at::Tensor toTensor() const &; - - const IValue& toIValue() const { - return *this; - } - IValue& toIValue() { - return *this; - } - - IValue(intrusive_ptr blob) - : tag(Tag::Blob), is_intrusive_ptr(true) { - // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract - // and store it as a Tensor instead. 
- payload.as_intrusive_ptr = blob.release(); - } - bool isBlob() const { - return Tag::Blob == tag; - } - c10::intrusive_ptr toBlob() &&; - c10::intrusive_ptr toBlob() const &; - - // Tuple - IValue(c10::intrusive_ptr v); - bool isTuple() const { return Tag::Tuple == tag; } - c10::intrusive_ptr toTuple() &&; - c10::intrusive_ptr toTuple() const &; - - // Double - IValue(double d) - : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; - } - bool isDouble() const { return Tag::Double == tag; } - double toDouble() const { - AT_ASSERT(isDouble()); - return payload.as_double; - } - - // Future - IValue(c10::intrusive_ptr v); - bool isFuture() const { return Tag::Future == tag; } - c10::intrusive_ptr toFuture() &&; - c10::intrusive_ptr toFuture() const &; - - // Int - IValue(int64_t i) - : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; - } - - // allow you to pass literals (3, 4) without ambiguity - IValue(int32_t i) - : IValue(static_cast(i)) {} - - bool isInt() const { return Tag::Int == tag; } - - int64_t toInt() const { - AT_ASSERT(isInt()); - return payload.as_int; - } - - // Bool - IValue(bool b) - : tag(Tag::Bool), is_intrusive_ptr(false) { - payload.as_bool = b; - } - bool isBool() const { return Tag::Bool == tag; } - bool toBool() const { - AT_ASSERT(isBool()); - return payload.as_bool; - } - - // IntList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - IValue(at::ArrayRef v) - : IValue(v.vec()) {} - bool isIntList() const { return Tag::IntList == tag; } - c10::intrusive_ptr toIntList() &&; - c10::intrusive_ptr toIntList() const &; - - const std::vector& toIntListRef() const; - const std::vector& toDoubleListRef() const; - const std::vector& toBoolListRef() const; - const std::vector& toTensorListRef() const; - const std::vector& toGenericListRef() const; - const c10::Dict& toGenericDictRef() const; - const std::string& toStringRef() const; - - // ConstantString - IValue(c10::intrusive_ptr v); - IValue(std::string v); - IValue(const char* v): IValue(std::string(v)) {} - bool isString() const { return Tag::String == tag; } - c10::intrusive_ptr toString() &&; - c10::intrusive_ptr toString() const &; - - // DoubleList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isDoubleList() const { return Tag::DoubleList == tag; } - c10::intrusive_ptr toDoubleList() &&; - c10::intrusive_ptr toDoubleList() const &; - - // BoolList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isBoolList() const { return Tag::BoolList == tag; } - c10::intrusive_ptr toBoolList() &&; - c10::intrusive_ptr toBoolList() const &; - - //TensorList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isTensorList() const { return Tag::TensorList == tag; } - c10::intrusive_ptr toTensorList() &&; - c10::intrusive_ptr toTensorList() const &; - - //GenericList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isGenericList() const { return Tag::GenericList == tag; } - c10::intrusive_ptr toGenericList() &&; - c10::intrusive_ptr toGenericList() const &; - - // GenericDict - IValue(c10::intrusive_ptr v); - IValue(c10::Dict v); - bool isGenericDict() const { return Tag::GenericDict == tag; } - c10::intrusive_ptr toGenericDict() &&; - c10::intrusive_ptr toGenericDict() const &; - - // ClassType - IValue(c10::intrusive_ptr v); - bool isObject() const { return tag == Tag::Object; } - c10::intrusive_ptr toObject() &&; - c10::intrusive_ptr toObject() const & ; - - // None - bool isNone() const { - return Tag::None == tag; - } - 
std::string toNone() const { - AT_ASSERT(isNone()); - return "None"; - } - // Scalar, which gets encoded as either an Int or a Double - IValue(at::Scalar s) - : IValue() { - if(s.isFloatingPoint()) { - *this = s.toDouble(); - } else { - *this = s.toLong(); - } - } - bool isScalar() const { - return isDouble() || isInt(); - } - at::Scalar toScalar() const { - if(isDouble()) - return toDouble(); - else if(isInt()) - return toInt(); - throw std::runtime_error("IValue is not a Scalar"); - } - - // Device - IValue(c10::Device d) - : tag(Tag::Device), is_intrusive_ptr(false) { - payload.as_device.type = d.type(); - payload.as_device.index = d.index(); - } - bool isDevice() const { return Tag::Device == tag; } - c10::Device toDevice() const { - AT_ASSERT(isDevice()); - return c10::Device(payload.as_device.type, payload.as_device.index); - } - - // ScalarType - at::ScalarType toScalarType() const { - return static_cast(toInt()); - } - - // Layout - at::Layout toLayout() const { - return static_cast(toInt()); - } - - // for debugging - std::string tagKind() const { - switch(tag) { - #define DEFINE_CASE(x) case Tag::x: return #x; - TORCH_FORALL_TAGS(DEFINE_CASE) - #undef DEFINE_CASE - } - return "Invalid Tag"; - } - - // generic v.to() implementations - // that can be used in special functions like pop/push - // that use template meta-programming. - // prefer the directly named methods when you can, - // since they are simpler to understand - - // Note: if you get linker errors saying one of these is missing, - // change it to ... && = delete; and you will see better error messages for why - // However, we cannot commit this because some compiler versions barf on it. - template - T to() &&; - template - T to() const &; - - // ToOptional: convert a IValue to the Optional obj that accepts both T and None - template - optional toOptional(); - - // this is a shallow comparison of two IValues to test the object identity - bool isSameIdentity(const IValue& rhs) const; - - CAFFE2_API friend std::ostream& operator<<( - std::ostream& out, - const IValue& v); - - bool isPtrType() const { - return is_intrusive_ptr; - } - - private: - // NOTE: IValue tags are intentionally private. In the future we may encode - // this value different (e.g. using NaN boxing), and this would make it more - // costly to determine the tag for all types vs just determining if something - // is a particular type. Instead we want clients to use the `isX` methods when - // possible. If for perf. reasons you really, absolutely, must have a jump - // table, then we can revisit this. 
- enum class Tag : uint32_t { -#define DEFINE_TAG(x) x, - TORCH_FORALL_TAGS(DEFINE_TAG) -#undef DEFINE_TAG - }; - - template> - c10::intrusive_ptr moveToIntrusivePtr(); - template> - c10::intrusive_ptr toIntrusivePtr() const; - - void clearToNone() { - payload.as_int = 0; - tag = Tag::None; - is_intrusive_ptr = false; - } - union { - int64_t as_int; - double as_double; - bool as_bool; - c10::intrusive_ptr_target* as_intrusive_ptr; - struct { - DeviceType type; - DeviceIndex index; - } as_device; - } payload; - Tag tag; - bool is_intrusive_ptr; -}; - -} diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h new file mode 100644 index 000000000000..9aba56ebc84e --- /dev/null +++ b/aten/src/ATen/core/ivalue_inl.h @@ -0,0 +1,652 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace script { +struct Function; +} +} // namespace jit +} // namespace torch +namespace c10 { +struct IValue; +struct ClassType; + +template +c10::intrusive_ptr IValue::moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + clearToNone(); + return t; +} +template +c10::intrusive_ptr IValue::toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + auto p = r; + r.release(); + return p; +} + +inline c10::intrusive_ptr IValue::toTuple() && { + AT_ASSERT(isTuple()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toTuple() const & { + AT_ASSERT(isTuple()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toFuture() && { + AT_ASSERT(isFuture()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toFuture() const & { + AT_ASSERT(isFuture()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toIntList() && { + AT_ASSERT(isIntList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toIntList() const & { + AT_ASSERT(isIntList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toString() && { + AT_ASSERT(isString()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toString() const & { + AT_ASSERT(isString()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toDoubleList() && { + AT_ASSERT(isDoubleList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toDoubleList() const & { + AT_ASSERT(isDoubleList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toBoolList() && { + AT_ASSERT(isBoolList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toBoolList() const & { + AT_ASSERT(isBoolList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toTensorList() && { + AT_ASSERT(isTensorList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toTensorList() const & { + AT_ASSERT(isTensorList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericList() && { + AT_ASSERT(isGenericList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericList() const & { + AT_ASSERT(isGenericList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericDict() && { + AT_ASSERT(isGenericDict()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericDict() const & { + AT_ASSERT(isGenericDict()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toObject() && { + AT_ASSERT(isObject()); + return 
toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toObject() const & { + AT_ASSERT(isObject()); + return toIntrusivePtr(); +} +inline at::Tensor IValue::toTensor() && { + AT_ASSERT(isTensor()); + return at::Tensor(moveToIntrusivePtr()); +} +inline at::Tensor IValue::toTensor() const & { + AT_ASSERT(isTensor()); + return at::Tensor(toIntrusivePtr()); +} +inline c10::intrusive_ptr IValue::toBlob() && { + AT_ASSERT(isBlob()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toBlob() const & { + AT_ASSERT(isBlob()); + return toIntrusivePtr();; +} + +namespace ivalue { + +template +using Shared = c10::intrusive_ptr; + +// string +struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { + private: + const std::string str_; + public: + ConstantString(std::string str) + : str_(std::move(str)) {} + static c10::intrusive_ptr create(std::string str_); + const std::string & string() const { + return str_; + } + operator const std::string & () const { + return string(); + } + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const ConstantString& v); +}; + +template +struct CAFFE2_API List : c10::intrusive_ptr_target { + private: + std::vector elements_; + + public: + typedef Elem ElemType; + + List(std::vector elements_) : elements_(std::move(elements_)) {} + static c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); + } + const std::vector& elements() const & { + return elements_; + } + operator const std::vector&() const { + return elements(); + } + + std::vector& elements() & { + return elements_; + } + operator std::vector&() { + return elements(); + } + + std::vector&& elements() && { + return std::move(elements_); + } +}; + +struct Future; +struct GenericDict; + +struct CAFFE2_API Tuple : public List { + using List::List; + static c10::intrusive_ptr create(std::vector elements_) { + return c10::make_intrusive(std::move(elements_)); + } +}; + +struct Object; +} + +// Future +struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { + private: + c10::intrusive_ptr intrusive_from_this() { + c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer + // from a raw `this` pointer + // so we need to bump the refcount + // to account for this ownership + return c10::intrusive_ptr::reclaim(this); + } + + public: + struct CAFFE2_API FutureError final : public std::exception { + FutureError(std::string&& error_msg_) + : error_msg(std::move(error_msg_)) {} + + FutureError() = default; + + const char* what() const noexcept override { + return error_msg.c_str(); + } + + std::string error_msg; + }; + + /** + * Wait on the future until it completes. + */ + void wait() { + if (completed()) { + return; + } + std::condition_variable finished; + bool fired = false; + + // Add a callback to notify the current thread + // when the current future completes. + addCallback([&] { + std::unique_lock lock(mutex_); + finished.notify_all(); + fired = true; + }); + + // The current thread will be blocked unless the above callback is fired. + std::unique_lock lock(mutex_); + while (!fired) { + finished.wait(lock); + } + + AT_ASSERT(completed()); + } + + /** + * Explicitly mark the future as completed with the output value. 
+ */ + void markCompleted(IValue value) { + { + // This is not to protect completed_ but to create a barrier + // from possible addCallback() calls + std::unique_lock lock(mutex_); + AT_ASSERT(!completed()); + completed_ = true; + value_ = std::move(value); + } + + fireCallbacks(); + } + + void markCompleted(FutureError&& error_) { + { + // This is not to protect completed_ but to create a barrier + // from possible addCallback() calls + std::unique_lock lock(mutex_); + AT_ASSERT(!completed()); + completed_ = true; + has_error = true; + error = std::move(error_); + } + + fireCallbacks(); + } + + // Get the result of the current future. + IValue value() { + std::unique_lock lock(mutex_); + AT_ASSERT(completed()); + if (has_error) { + throw error; + } + return value_; + } + + /** + * Add a callback to the future. + * The callbacks will be executed once the future completes. + * If the future has already completed, + * this function will execute the callback immediately. + */ + void addCallback(std::function callback) { + std::unique_lock lock(mutex_); + if (completed()) { + lock.unlock(); + callback(); + return; + } + callbacks.push_back(callback); + } + + // Check if the current future has completed + bool completed() { + return completed_; + } + + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const Future& v); + + private: + void fireCallbacks() { + AT_ASSERT(completed()); + // There is no need to protect callbacks with the lock. + // Once completed_ is set to true, no one can add new callback to the list. + for (auto& callback : callbacks) { + callback(); + } + callbacks.clear(); + } + + std::mutex mutex_; + IValue value_; // when finished the value + std::atomic_bool completed_ = {false}; // is this future complete + std::vector> callbacks; + bool has_error = false; + FutureError error; +}; + +// User-defined object. +struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { + public: + Object(std::shared_ptr type, size_t numSlots) : type_(std::move(type)) { + slots_.resize(numSlots); + } + + static c10::intrusive_ptr create( + std::shared_ptr type, + size_t numSlots) { + return c10::make_intrusive(std::move(type), numSlots); + } + + /** + * Slot API. + * + * Attributes are stored as a simple vector so that lookups are fast at + * runtime. A "slot" is just an index into that vector, which can be computed + * statically if you have access to the class type. Use this API if you are + * writing compiler stuff. + */ + void setSlot(size_t slot, IValue v) { + if (slot >= slots_.size()) { + // for module types, it is possible that the members of the class have + // expanded after the object was created. In this case, we expand + // the slots to the right size + resizeObject(slot); + } + slots_[slot] = v; + } + + const IValue& getSlot(size_t slot) const { + return slots_.at(slot); + } + + /** + * Attribute API. + * + * Wrappers around the slot stuff so that users can access attributes + * directly. Use this API if you are a user. + * + * Note: Unlike in Python, TorchScript must make a distinction between + * attributes (which are IValues) and methods (which are Methods). 
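Stepping back to the ivalue::Future defined just above: addCallback(), markCompleted(), wait() and value() are meant to be combined as in the following sketch. It is illustrative only and assumes Future is default-constructible, which the declaration above suggests.

#include <ATen/core/ivalue.h>

void future_sketch() {
  auto fut = c10::make_intrusive<c10::ivalue::Future>();

  // Consumer side: the callback runs immediately if the future has already completed.
  fut->addCallback([fut]() {
    c10::IValue result = fut->value();  // value() rethrows the stored FutureError on failure
    (void)result;
  });

  // Producer side: completing the future fires any registered callbacks.
  fut->markCompleted(c10::IValue(7));

  fut->wait();  // returns immediately once completed() is true
}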
If you + * want a method, use `obj.type()->getMethod()` + */ + IValue getAttr(const std::string& name) const; + void setAttr(const std::string& name, IValue v); + + std::string name() const; + + const std::vector& slots() const { + return slots_; + } + std::shared_ptr type() const { + return type_; + } + + private: + void resizeObject(size_t slot); + std::shared_ptr type_; + std::vector slots_; +}; + +struct C10_EXPORT ivalue::GenericDict : c10::intrusive_ptr_target { + private: + c10::impl::GenericDict elements_; + + public: + GenericDict(c10::impl::GenericDict elements_) + : elements_(std::move(elements_)) {} + static c10::intrusive_ptr create( + c10::impl::GenericDict elements_) { + return c10::make_intrusive(std::move(elements_)); + } + const c10::impl::GenericDict& elements() const & { + return elements_; + } + c10::impl::GenericDict& elements() & { + return elements_; + } + + using IterationOrder = std::vector>; + const IterationOrder iterationOrder() const; +}; + +#undef TORCH_FORALL_TAGS + +namespace detail { + +struct _guarded_unsigned_long_unique_dummy final { + _guarded_unsigned_long_unique_dummy(int64_t){}; +}; +using _guarded_unsigned_long = c10::guts::conditional_t< + std::is_same::value || + std::is_same::value, + _guarded_unsigned_long_unique_dummy, + unsigned long>; + +} // namespace detail + +#define DEFINE_TO(type, method_name) \ +template<> \ +inline type IValue::to() && { \ + return std::move(*this).method_name(); \ +} \ +template<> \ +inline type IValue::to() const & { \ + return this->method_name(); \ +} +DEFINE_TO(at::Tensor, toTensor) +DEFINE_TO(c10::intrusive_ptr, toTuple) +DEFINE_TO(float, toDouble) +DEFINE_TO(double, toDouble) +DEFINE_TO(unsigned char, toInt) +DEFINE_TO(signed char, toInt) +DEFINE_TO(unsigned short, toInt) +DEFINE_TO(short, toInt) +DEFINE_TO(int, toInt) +DEFINE_TO(uint32_t, toInt) +DEFINE_TO(uint64_t, toInt) +DEFINE_TO(detail::_guarded_unsigned_long, toInt) +DEFINE_TO(int64_t, toInt) +DEFINE_TO(bool, toBool) +DEFINE_TO(c10::intrusive_ptr, toBlob); +DEFINE_TO(c10::intrusive_ptr, toDoubleList) +DEFINE_TO(c10::intrusive_ptr, toIntList) +DEFINE_TO(c10::intrusive_ptr, toBoolList) +DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toGenericList) +DEFINE_TO(c10::intrusive_ptr, toGenericDict) +DEFINE_TO(c10::intrusive_ptr, toString) +DEFINE_TO(c10::intrusive_ptr, toObject) +DEFINE_TO(at::Scalar, toScalar) +DEFINE_TO(std::vector, toIntListRef) +DEFINE_TO(std::vector, toDoubleListRef) +DEFINE_TO(std::vector, toBoolListRef) +DEFINE_TO(std::vector, toTensorListRef) +DEFINE_TO(std::vector, toGenericListRef) +DEFINE_TO(std::string, toStringRef) +DEFINE_TO(c10::intrusive_ptr, toFuture) +DEFINE_TO(IValue, toIValue) +DEFINE_TO(c10::Device, toDevice) +DEFINE_TO(at::ScalarType, toScalarType) +DEFINE_TO(at::Layout, toLayout) +DEFINE_TO(at::MemoryFormat, toMemoryFormat) + +template +struct _fake_type {}; + +template +std::vector generic_to( + const IValue* ivalue, + _fake_type>) { + return fmap(ivalue->toGenericListRef(), [](IValue item_ivalue) { return item_ivalue.to(); }); +} + +template +std::unordered_map generic_to( + const IValue* ivalue, + _fake_type>) { + std::unordered_map specialized_dict; + + for (auto item : ivalue->toGenericDictRef()) { + specialized_dict[item.key().to()] = item.value().to(); + } + + return specialized_dict; +} + +template +inline T IValue::to() && { + return generic_to(this, _fake_type{}); +} + +template +inline T IValue::to() const& { + return generic_to(this, _fake_type{}); +} + +// note: when adding a 
DEFINE_TO case here you should also add a +// toX method to IValue. These named methods are much more discoverable +// than the to templated function. + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Tuple), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::IntList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::IntList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::String), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::string v) +: IValue(ivalue::ConstantString::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::DoubleList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::DoubleList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::BoolList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::BoolList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::TensorList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::TensorList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::GenericList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericDict), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(c10::impl::GenericDict v) +: IValue(ivalue::GenericDict::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Object), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Future), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline const std::vector& IValue::toIntListRef() const { + return toIntList()->elements(); +} + +inline const std::vector& IValue::toDoubleListRef() const { + return toDoubleList()->elements(); +} + +inline const std::vector& IValue::toTensorListRef() const { + return toTensorList()->elements(); +} + +inline const std::vector& IValue::toBoolListRef() const { + return toBoolList()->elements(); +} + +inline const std::vector& IValue::toGenericListRef() const { + return toGenericList()->elements(); +} + +inline const c10::impl::GenericDict& IValue:: + toGenericDictRef() const { + return toGenericDict()->elements(); +} + +inline const std::string& IValue::toStringRef() const { + return toString()->string(); +} + +template +inline optional IValue::toOptional() { + if (this->isNone()) { + return nullopt; + } + return this->to(); +} + +inline bool IValue::isSameIdentity(const IValue& rhs) const { + // We choose to not use memcmp for payload check due to potential random padding characters on union type + + // Semantics: + // 1. None is None, False is False, and True is True are all true + // 2. If it is a tensor type, we need to take undefined tensor into account + // 3. Undefined_tensor is None and vice versa should be true + // 4. If it is a reference type (i.e. 
is_intrusive_ptr), then it is True when the pointed-to object is the same.
+  // 5. False for all other comparisons.
+  if (this->isNone() && rhs.isNone()) {
+    return true;
+  } else if (this->isBool() && rhs.isBool()) {
+    // for bool type, do equality check
+    return this->toBool() == rhs.toBool();
+  } else if (this->isTensor() && rhs.isTensor()) {
+    // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr is false for undefined tensor
+    return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr;
+  } else if (this->isTensor() && rhs.isNone()) {
+    // special case: undefined tensor and None are the same identity
+    return !this->is_intrusive_ptr;
+  } else if (this->isNone() && rhs.isTensor()) {
+    // special case: undefined tensor and None are the same identity
+    return !rhs.is_intrusive_ptr;
+  } else {
+    // for objects held in IValue, do a shallow compare on the pointer address to test identity
+    return this->is_intrusive_ptr && rhs.is_intrusive_ptr
+      && this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr;
+  }
+}
+
+} // namespace c10
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index 092c9c38fa73..350a0100ee23 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -494,6 +494,15 @@ struct CAFFE2_API ProfiledTensorType : public TensorType {
     return ProfiledTensorTypePtr(new ProfiledTensorType(scalar_type, device, sizes, strides, requires_grad));
   }
+  static ProfiledTensorTypePtr create(ProfiledTensorTypePtr pttp) {
+    return ProfiledTensorTypePtr(new ProfiledTensorType(
+        pttp->scalarType(),
+        pttp->device(),
+        pttp->sizes(),
+        pttp->strides(),
+        pttp->requiresGrad()));
+  }
+
   const VaryingShape& sizes() const { return sizes_; }
   const VaryingStrides& strides() const { return strides_; }
   c10::optional device() const { return device_; }
diff --git a/aten/src/ATen/core/op_registration/base.h b/aten/src/ATen/core/op_registration/base.h
deleted file mode 100644
index 3fdf6e9314b1..000000000000
--- a/aten/src/ATen/core/op_registration/base.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#pragma once
-
-/**
- * This file sets up the basics for operator registration.
- *
- * You probably don't want to include this file directly but include
- * op_registration.h instead since that adds more functionality you'll
- * likely need to register your operators.
- */
-
-#include
-
-namespace c10 {
-
-namespace detail {
-
-  // KernelRegistrationConfig accumulates all information from the config
-  // parameters passed to a RegisterOperators::op() call into one object.
-  struct KernelRegistrationConfig final {
-    c10::optional<TensorTypeId> dispatch_key = c10::nullopt;
-    KernelFunction* kernel_func = nullptr;
-    KernelCacheCreatorFunction cache_creator_func = nullptr;
-    std::unique_ptr<FunctionSchema> inferred_function_schema = nullptr;
-  };
-
-  // is_registration_config_parameter is a concept that returns true_type iff its argument is
-  // a valid parameter to be passed to c10::RegisterOperators().op(parameters...)
-  // That is, it must have an apply method that takes a KernelRegistrationConfig*.
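The base.h and dispatch_key.h files deleted here (together with the kernel<...>() helper removed from kernel_function.h further down) implemented the old free-function registration style; the test updates later in this diff switch to the RegisterOperators::options() builder. A hedged before/after sketch, with a made-up operator name and kernel:

#include <ATen/core/op_registration/op_registration.h>

namespace {
at::Tensor my_kernel_cpu(at::Tensor a) { return a; }  // hypothetical kernel, for illustration only
}

// Old style (removed by this diff):
//   static auto registry = c10::RegisterOperators()
//       .op("my_ns::my_op(Tensor a) -> Tensor",
//           c10::kernel<decltype(my_kernel_cpu), &my_kernel_cpu>(),
//           c10::dispatchKey(c10::CPUTensorId()));

// New style (as used by the updated tests):
static auto registry = c10::RegisterOperators()
    .op("my_ns::my_op(Tensor a) -> Tensor",
        c10::RegisterOperators::options()
            .kernel<decltype(my_kernel_cpu), &my_kernel_cpu>()
            .dispatchKey(c10::CPUTensorId()));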
- template - struct is_registration_config_parameter : std::false_type { - static_assert(std::is_same>::value, "is_registration_config_parameter doesn't work with reference types"); - }; - template - struct is_registration_config_parameter().apply(std::declval()), - std::declval().apply(std::declval()) - )>> : std::true_type { - static_assert(std::is_same>::value, "is_registration_config_parameter doesn't work with reference types"); - }; - static_assert(!is_registration_config_parameter::value, "For classes that aren't registration parameters, this concept should return false"); - // note: the corresponding asserts that the concept returns true are next to the definition of the corresponding classes - - // Take a list of configuration parameters and return a - // KernelRegistrationConfig accumulating all their configurations. - template - KernelRegistrationConfig make_registration_config(ConfigParameters&&... configParameters) { - static_assert(guts::conjunction>...>::value, "One of the parameters isn't a valid registration config parameter."); - - KernelRegistrationConfig config; - - // apply all configParameters - (void)std::initializer_list{(std::forward(configParameters).apply(&config), 0)...}; - - return config; - } -} - -} diff --git a/aten/src/ATen/core/op_registration/dispatch_key.h b/aten/src/ATen/core/op_registration/dispatch_key.h deleted file mode 100644 index a4ced362d9e5..000000000000 --- a/aten/src/ATen/core/op_registration/dispatch_key.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -/** - * This file implements c10::dispatchKey() which is used in the kernel - * registration API to set the dispatch key for a registered kernel. - * - * You probably don't want to include this file directly but include - * op_registration.h instead since that adds more functionality you'll - * likely need to register your operators. - */ - -#include - -namespace c10 { - -namespace detail { - struct DispatchKeyConfigParameter final { - explicit constexpr DispatchKeyConfigParameter(TensorTypeId dispatch_key) - : dispatch_key_(dispatch_key) {} - - void apply(KernelRegistrationConfig* registration) const { - registration->dispatch_key = dispatch_key_; - } - - private: - TensorTypeId dispatch_key_; - }; - static_assert(is_registration_config_parameter::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); -} - -/** - * Use this to register an operator with a kernel for a certain dispatch key. 
- * - * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > class my_kernel_cuda final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())) - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CUDATensorId())); - */ -inline constexpr detail::DispatchKeyConfigParameter dispatchKey(TensorTypeId dispatch_key) { - return detail::DispatchKeyConfigParameter(dispatch_key); -} - -} diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 36f681efb82c..32846755083c 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -98,6 +98,6 @@ FunctionSchema inferFunctionSchema(std::string name, std::string overload_name) return detail::createFunctionSchemaFromTraits>(std::move(name), std::move(overload_name)); } -C10_API void assertSchemasHaveSameSignature(const FunctionSchema& inferred, const FunctionSchema& specified); +CAFFE2_API void assertSchemasHaveSameSignature(const FunctionSchema& inferred, const FunctionSchema& specified); } diff --git a/aten/src/ATen/core/op_registration/kernel_function.h b/aten/src/ATen/core/op_registration/kernel_function.h index e5d7bb3fde1e..b08490d856ba 100644 --- a/aten/src/ATen/core/op_registration/kernel_function.h +++ b/aten/src/ATen/core/op_registration/kernel_function.h @@ -26,26 +26,4 @@ namespace detail { }; } -/** - * Use this to register an operator whose kernel is implemented by a function: - * - * Example: - * - * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); - */ -template -inline constexpr auto kernel() -> -// enable_if: only enable it if FuncType is actually a function -guts::enable_if_t::value, -decltype(kernel::type>())> { - static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) 
API or also implement the kernel function as defined by the public API."); - - return kernel::type>(); -} - } diff --git a/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp b/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp index 5d13721de165..b4b24aff78da 100644 --- a/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp @@ -18,8 +18,6 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -27,7 +25,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -364,7 +361,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithIntInput int64_t captured_input_list_size = 0; -void kernelWithIntListInputWithoutOutput(Tensor, ArrayRef input1) { +void kernelWithIntListInputWithoutOutput(Tensor, const std::vector& input1) { captured_input_list_size = input1.size(); } @@ -381,7 +378,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithIntListI EXPECT_EQ(3, captured_input_list_size); } -int64_t kernelWithIntListInputWithOutput(Tensor, ArrayRef input1) { +int64_t kernelWithIntListInputWithOutput(Tensor, const std::vector& input1) { return input1.size(); } @@ -397,7 +394,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithIntListI EXPECT_EQ(3, outputs[0].toInt()); } -void kernelWithTensorListInputWithoutOutput(ArrayRef input1) { +void kernelWithTensorListInputWithoutOutput(const std::vector& input1) { captured_input_list_size = input1.size(); } @@ -414,7 +411,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithTensorLi EXPECT_EQ(2, captured_input_list_size); } -int64_t kernelWithTensorListInputWithOutput(ArrayRef input1) { +int64_t kernelWithTensorListInputWithOutput(const std::vector& input1) { return input1.size(); } @@ -496,6 +493,27 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithLegacyTe EXPECT_EQ(2, outputs[0].toInt()); } +std::vector kernelWithStringListOutput(std::vector input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithStringListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::stringlist_output(str[] input) -> str[]", &kernelWithStringListOutput); + + auto op = c10::Dispatcher::singleton().findSchema("_test::stringlist_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector list{"value1", "value2"}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + auto output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ("value1", output[0].toString()->string()); + EXPECT_EQ("value2", output[1].toString()->string()); +} + int captured_dict_size = 0; void kernelWithDictInputWithoutOutput(Dict input1) { @@ -622,6 +640,118 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithUnordere EXPECT_EQ("value2", output.at("key2")); } +std::unordered_map> kernelWithMapOfIntList(std::unordered_map> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithMapOfList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + 
.op("_test::dict_output(Dict(str, int[]) input) -> Dict(str, int[])", &kernelWithMapOfIntList); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map> dict; + dict.insert({"key1", std::vector{10, 20}}); + dict.insert({"key2", std::vector{30, 40}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output.at("key1").size()); + EXPECT_EQ(10, output.at("key1")[0]); + EXPECT_EQ(20, output.at("key1")[1]); + EXPECT_EQ(2, output.at("key2").size()); + EXPECT_EQ(30, output.at("key2")[0]); + EXPECT_EQ(40, output.at("key2")[1]); +} + +std::unordered_map>> kernelWithMapOfListOfMap(std::unordered_map>> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithMapOfListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::dict_output(Dict(str, Dict(int,str)[]) input) -> Dict(str, Dict(int,str)[])", &kernelWithMapOfListOfMap); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map>> dict; + dict.insert({"key1", {{{10, "10"}, {20, "20"}}}}); + dict.insert({"key2", {{{30, "30"}, {40, "40"}}}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(1, output.at("key1").size()); + EXPECT_EQ(2, output.at("key1")[0].size()); + EXPECT_EQ("10", output.at("key1")[0][10]); + EXPECT_EQ("20", output.at("key1")[0][20]); + EXPECT_EQ(2, output.at("key2")[0].size()); + EXPECT_EQ("30", output.at("key2")[0][30]); + EXPECT_EQ("40", output.at("key2")[0][40]); +} + +std::vector> kernelWithListOfMap(std::vector> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int)[] input) -> Dict(str, int)[]", &kernelWithListOfMap); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector> list{{{"1", 1}, {"2", 2}}, {{"3", 3}, {"4", 4}}}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toInt()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("2").toInt()); + EXPECT_EQ(2, output[1].toGenericDictRef().size()); + EXPECT_EQ(3, output[1].toGenericDictRef().at("3").toInt()); + EXPECT_EQ(4, output[1].toGenericDictRef().at("4").toInt()); +} + +std::vector>> kernelWithListOfMapOfIntList(std::vector>> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithListOfMapOfIntList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int[])[] input) -> Dict(str, int[])[]", &kernelWithListOfMapOfIntList); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector>> list{{{"1", {1, 2}}, {"3", {3, 4}}}, {{"5", {5, 6}}, {"7", {7, 8}}}}; + auto outputs = callOp(*op, list); + 
EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toIntListRef()[0]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef()[1]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("3").toIntListRef().size()); + EXPECT_EQ(3, output[0].toGenericDictRef().at("3").toIntListRef()[0]); + EXPECT_EQ(4, output[0].toGenericDictRef().at("3").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("5").toIntListRef().size()); + EXPECT_EQ(5, output[1].toGenericDictRef().at("5").toIntListRef()[0]); + EXPECT_EQ(6, output[1].toGenericDictRef().at("5").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("7").toIntListRef().size()); + EXPECT_EQ(7, output[1].toGenericDictRef().at("7").toIntListRef()[0]); + EXPECT_EQ(8, output[1].toGenericDictRef().at("7").toIntListRef()[1]); +} + bool called = false; void kernelWithoutInputs() { @@ -760,7 +890,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithOptional EXPECT_TRUE(outputs[2].isNone()); } -std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, ArrayRef arg3) { +std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, const std::vector& arg3) { return {}; } diff --git a/aten/src/ATen/core/op_registration/kernel_function_test.cpp b/aten/src/ATen/core/op_registration/kernel_function_test.cpp index 0bad6f2486ac..397cc5e60bd2 100644 --- a/aten/src/ATen/core/op_registration/kernel_function_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_function_test.cpp @@ -6,8 +6,6 @@ #include using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -15,7 +13,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -60,32 +57,32 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::error(Tensor 
dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); - auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -108,7 +105,7 @@ void kernelWithoutOutput(const Tensor&) { } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::no_return", ""); ASSERT_TRUE(op.has_value()); @@ -124,7 +121,7 @@ std::tuple<> kernelWithZeroOutputs(const Tensor&) { } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -140,7 +137,7 @@ int64_t kernelWithIntOutput(Tensor, int64_t a, int64_t b) { TEST(OperatorRegistrationTest_FunctionBasedKernel, 
givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_output(Tensor dummy, int a, int b) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_output(Tensor dummy, int a, int b) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); ASSERT_TRUE(op.has_value()); @@ -156,8 +153,8 @@ Tensor kernelWithTensorOutput(const Tensor& input) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); ASSERT_TRUE(op.has_value()); @@ -177,7 +174,7 @@ std::vector kernelWithTensorListOutput(const Tensor& input1, const Tenso TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -196,7 +193,7 @@ std::vector kernelWithIntListOutput(const Tensor&, int64_t input1, int6 TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -224,7 +221,7 @@ std::tuple, c10::optional, Dict (Tensor, int, Tensor[], int?, Dict(str, Tensor))", kernel(), dispatchKey(TensorType1())); + .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -253,8 +250,8 @@ Tensor kernelWithTensorInputByValueWithOutput(Tensor input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", 
RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -270,8 +267,8 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByR TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -297,8 +294,8 @@ void kernelWithTensorInputByValueWithoutOutput(Tensor input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -314,8 +311,8 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByR TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -337,7 +334,7 @@ void kernelWithIntInputWithoutOutput(Tensor, int64_t input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -354,7 +351,7 @@ int64_t kernelWithIntInputWithOutput(Tensor, int64_t input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); 
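A related signature change runs through these tests: list arguments declared as int[] or Tensor[] in the schema now reach the kernel as const std::vector references instead of ArrayRef (see the kernelWithIntListInput and kernelWithTensorListInput updates just below). An illustrative kernel, not taken from this diff:

#include <cstdint>
#include <vector>
#include <ATen/ATen.h>

// The leading Tensor argument only carries the dispatch key; the int[] schema
// argument now arrives as a const std::vector<int64_t>&.
int64_t sum_int_list(at::Tensor /*dummy*/, const std::vector<int64_t>& xs) {
  int64_t total = 0;
  for (int64_t x : xs) {
    total += x;
  }
  return total;
}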
auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -366,13 +363,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withO int64_t captured_input_list_size = 0; -void kernelWithIntListInputWithoutOutput(Tensor, ArrayRef input1) { +void kernelWithIntListInputWithoutOutput(Tensor, const std::vector& input1) { captured_input_list_size = input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -383,13 +380,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_w EXPECT_EQ(3, captured_input_list_size); } -int64_t kernelWithIntListInputWithOutput(Tensor, ArrayRef input1) { +int64_t kernelWithIntListInputWithOutput(Tensor, const std::vector& input1) { return input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -399,13 +396,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_w EXPECT_EQ(3, outputs[0].toInt()); } -void kernelWithTensorListInputWithoutOutput(ArrayRef input1) { +void kernelWithTensorListInputWithoutOutput(const std::vector& input1) { captured_input_list_size = input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -416,13 +413,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInpu EXPECT_EQ(2, captured_input_list_size); } -int64_t kernelWithTensorListInputWithOutput(ArrayRef input1) { +int64_t kernelWithTensorListInputWithOutput(const std::vector& input1) { return input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -440,7 +437,7 @@ void kernelWithDictInputWithoutOutput(Dict input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, 
givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, Tensor) input) -> ()", kernel()); + .op("_test::dict_input(Dict(str, Tensor) input) -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -460,7 +457,7 @@ string kernelWithDictInputWithOutput(Dict input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithDictInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, str) input) -> str", kernel()); + .op("_test::dict_input(Dict(str, str) input) -> str", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -479,7 +476,7 @@ Dict kernelWithDictOutput(Dict input) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithDictOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", kernel()); + .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); ASSERT_TRUE(op.has_value()); @@ -507,7 +504,7 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenFallbackKernelWithoutAny // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel()); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -526,7 +523,7 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenFallbackKernelWithoutTen // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel()); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -548,7 +545,7 @@ void kernelWithOptInputWithoutOutput(Tensor arg1, const c10::optional& a } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -583,7 +580,7 @@ c10::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optio } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? 
arg2, int? arg3, str? arg4) -> Tensor?", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -617,7 +614,7 @@ kernelWithOptInputWithMultipleOutputs(Tensor arg1, const c10::optional& } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -634,13 +631,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs EXPECT_TRUE(outputs[2].isNone()); } -std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, ArrayRef arg3) { +std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, const std::vector& arg3) { return {}; } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", kernel()); + .op("_test::no_schema_specified", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); @@ -658,35 +655,35 @@ template struct kernel_func final { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentNumArguments_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch() -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch() -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. 
Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 3 but inferred 2" ); } @@ -694,18 +691,18 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDif TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentArgumentType_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg1, int arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, int arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg1, float arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, float arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in argument 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(int arg1, int arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(int arg1, int arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in argument 1: specified int but inferred Tensor" ); } @@ -713,58 +710,58 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDif TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentNumReturns_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 1" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 0" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 2 but inferred 0" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 3 but inferred 2" ); } @@ -772,46 +769,46 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDif TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentReturnTypes_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified Tensor but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred int" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred Tensor" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, int)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, int)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, float)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, float)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified int but inferred Tensor" ); } diff --git a/aten/src/ATen/core/op_registration/kernel_functor.h b/aten/src/ATen/core/op_registration/kernel_functor.h index 8966b734cc14..f43c0fc3ef56 100644 --- a/aten/src/ATen/core/op_registration/kernel_functor.h +++ 
b/aten/src/ATen/core/op_registration/kernel_functor.h @@ -1,6 +1,5 @@ #pragma once -#include #include namespace c10 { @@ -52,17 +51,6 @@ namespace detail { } }; template - struct ivalue_to_arg_type> { - static ArrayRef call(const IValue& v) { - // Note: This takes a `const IValue&` argument and not `IValue&&`, because the - // returned ArrayRef is non-owning, so the call site needs to keep ownership - // TODO Do we want to support ArrayRef> ? - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: c10::ArrayRef and T is not one of the supported primitive types."); - static_assert(!std::is_same::value, "You tried to register a kernel with an unsupported argument type: c10::ArrayRef. Please use c10::ArrayRef, c10::ArrayRef or Tensor instead."); - return v.to>>()->elements(); - } - }; - template struct ivalue_to_arg_type> { static optional call(IValue&& v) { if (v.isNone()) { @@ -81,24 +69,40 @@ namespace detail { return impl::toTypedDict(std::move(dict_ptr->elements())); } }; - // The following specialisations of ivalue_to_arg_type are technically not - // necessary since we would hit the base case and show an error message - // there if they didn't exist, but we can show a better error message - // in some common error scenarios. template - struct ivalue_to_arg_type> { - // We don't support std::vector because that would prevent us from doing - // internal optimization to how we represent lists (e.g. SmallVector). - // Users should use ArrayRef instead. - static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported argument type: std::vector. Please use c10::ArrayRef instead."); + struct ivalue_to_arg_type, guts::enable_if_t::value && !std::is_same::value>> final { + static std::vector call(IValue&& v) { + return std::move(*std::move(v).to>>()).elements(); + } + }; + template + struct ivalue_to_arg_type, guts::enable_if_t::value || std::is_same::value>> final { + static std::vector call(IValue&& v) { + auto list = std::move(v).toGenericList(); + std::vector result; + result.reserve(list->elements().size()); + for (auto&& elem : std::move(list)->elements()) { + result.push_back(ivalue_to_arg_type::call(std::move(elem))); + } + return result; + } }; template - struct ivalue_to_arg_type> { - // We don't support std::vector because that would prevent us from doing - // internal optimization to how we represent lists (e.g. SmallVector). - // Users should use ArrayRef instead. - static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported argument type: std::unordered_map. Please use c10::Dict instead."); + struct ivalue_to_arg_type> final { + static std::unordered_map call(IValue&& v) { + auto dict = std::move(v).toGenericDict(); + std::unordered_map result; + result.reserve(dict->elements().size()); + for (auto& element : dict->elements()) { + result.emplace(ivalue_to_arg_type::call(element.key()), ivalue_to_arg_type::call(element.value())); + } + return result; + } }; + // The following specialisations of ivalue_to_arg_type are technically not + // necessary since we would hit the base case and show an error message + // there if they didn't exist, but we can show a better error message + // in some common error scenarios. template struct ivalue_to_arg_type::value>> { // There is no reason to support float when we have double. Keep the API lean. 
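
The hunk above swaps the ArrayRef-based argument specialization for std::vector ones that recurse into their element type (with a separate fast path for primitive element types), plus an std::unordered_map specialization that does the same for keys and values. A minimal standalone sketch of that recursive structure follows; the toy Value type stands in for IValue and is an assumption, and the real c10 accessors and SFINAE conditions differ.

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Toy stand-in for IValue (assumption); the real type stores lists and dicts
// behind intrusive pointers and exposes different accessors.
struct Value {
  long long i = 0;
  std::string s;
  std::vector<Value> list;                    // generic list payload
  std::vector<std::pair<Value, Value>> dict;  // generic dict payload
};

// Base case is left undefined: using an unsupported argument type fails to
// compile, which is the role the static_asserts play in the real header.
template <class T, class Enable = void>
struct to_arg_type;

// Primitive fast paths: the payload is stored directly in the Value.
template <>
struct to_arg_type<long long> {
  static long long call(Value&& v) { return v.i; }
};
template <>
struct to_arg_type<std::string> {
  static std::string call(Value&& v) { return std::move(v.s); }
};

// std::vector<T>: convert element-wise, recursing through to_arg_type<T>, so
// nested containers (vector of map, map of vector, ...) work as well.
template <class T>
struct to_arg_type<std::vector<T>> {
  static std::vector<T> call(Value&& v) {
    std::vector<T> result;
    result.reserve(v.list.size());
    for (auto& elem : v.list) {
      result.push_back(to_arg_type<T>::call(std::move(elem)));
    }
    return result;
  }
};

// std::unordered_map<K, V>: same idea, converting keys and values recursively.
template <class K, class V>
struct to_arg_type<std::unordered_map<K, V>> {
  static std::unordered_map<K, V> call(Value&& v) {
    std::unordered_map<K, V> result;
    result.reserve(v.dict.size());
    for (auto& kv : v.dict) {
      result.emplace(to_arg_type<K>::call(std::move(kv.first)),
                     to_arg_type<V>::call(std::move(kv.second)));
    }
    return result;
  }
};

In the real header the recursion bottoms out in the existing primitive specializations, and the specializations kept just above exist only to produce better error messages for unsupported types.
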
@@ -115,37 +119,14 @@ namespace detail { // legacy_ivalue_to_arg_type is like ivalue_to_arg_type but additionally // allows a few deprecated types like std::vector. - template + template struct legacy_ivalue_to_arg_type final { static auto call(IValue&& v) -> decltype(ivalue_to_arg_type::call(std::move(v))) { return ivalue_to_arg_type::call(std::move(v)); } }; - template - struct legacy_ivalue_to_arg_type> final { - static std::vector call(IValue&& v) { - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: std::vector and T is not one of the supported primitive types."); - return std::move(*std::move(v).to>>()).elements(); - } - }; - template - struct legacy_ivalue_to_arg_type> final { - static std::unordered_map call(const IValue& v) { - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: std::unordered_map and Key is not one of the supported primitive types."); - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: std::unordered_map and Value is not one of the supported primitive types."); - auto dict_ptr = std::move(v).toGenericDict(); - auto dict = impl::toTypedDict(std::move(dict_ptr->elements())); - std::unordered_map result; - result.reserve(dict.size()); - for (auto& element : dict) { - result.emplace(element.key(), element.value()); - } - return result; - } - }; - - // TODO Make nesting types work, e.g. Dicts of lists, lists of lists, and so on + // TODO Make nesting types work with new style API, e.g. Dicts of lists, lists of lists, and so on template struct return_type_to_ivalue { @@ -153,8 +134,9 @@ namespace detail { }; template struct return_type_to_ivalue::value>> { - static IValue call(T&& v) { - return IValue(std::move(v)); + template + static IValue call(T_&& v) { + return IValue(std::forward(v)); } }; template @@ -167,13 +149,25 @@ namespace detail { } }; template - struct return_type_to_ivalue> { + struct return_type_to_ivalue, guts::enable_if_t::value && !std::is_same::value>> { static IValue call(std::vector&& v) { static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported return type: vector and T is not one of the supported primitive types."); static_assert(!std::is_same::value, "You tried to register a kernel with an unsupported return type: vector. Please use vector, vector or Tensor instead."); return IValue(std::move(v)); } }; + template + struct return_type_to_ivalue, guts::enable_if_t::value || std::is_same::value>> { + static IValue call(std::vector&& v) { + static_assert(!std::is_same::value, "You tried to register a kernel with an unsupported return type: vector. 
Please use vector, vector or Tensor instead."); + std::vector result; + result.reserve(v.size()); + for (auto& elem : v) { + result.push_back(return_type_to_ivalue::call(std::move(elem))); + } + return result; + } + }; template struct return_type_to_ivalue> { static IValue call(c10::Dict&& v) { @@ -182,6 +176,17 @@ namespace detail { return IValue(impl::toGenericDict(std::move(v))); } }; + template + struct return_type_to_ivalue> final { + static IValue call(std::unordered_map&& v) { + c10::impl::GenericDict dict; + dict.reserve(v.size()); + for (auto& element : v) { + dict.insert(return_type_to_ivalue::call(Key{element.first}), return_type_to_ivalue::call(std::move(element.second))); + } + return dict; + } + }; // The following specialisations of return_type_to_ivalue are technically not // necessary since we would hit the base case and show an error message // there if they didn't exist, but we can show a better error message @@ -190,10 +195,6 @@ namespace detail { struct return_type_to_ivalue> { static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported return type: c10::ArrayRef. Please use std::vector instead."); }; - template - struct return_type_to_ivalue> { - static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported return type: std::unordered_map. Please use c10::Dict instead."); - }; template struct return_type_to_ivalue::value>> { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported return type: float. Please use double instead."); @@ -207,22 +208,12 @@ namespace detail { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported integral return argument type. Please use int64_t instead."); }; // legacy_return_type_to_ivalue is like return_type_to_ivalue but additionally - // allows a few deprecated types like std::vector. - template + // allows a few deprecated types like std::unordered_map. + template struct legacy_return_type_to_ivalue final { - static IValue call(T&& v) { - return return_type_to_ivalue::call(std::move(v)); - } - }; - template - struct legacy_return_type_to_ivalue> final { - static IValue call(std::unordered_map&& v) { - c10::Dict dict; - dict.reserve(v.size()); - for (auto& element : v) { - dict.insert(element.first, element.second); - } - return return_type_to_ivalue>::call(std::move(dict)); + template + static IValue call(T_&& v) { + return return_type_to_ivalue::call(std::forward(v)); } }; @@ -320,63 +311,6 @@ namespace detail { return guts::make_unique(inferFunctionSchema("", "")); } }; - - template - detail::KernelRegistrationConfigParameter...>, detail::FunctionSchemaInferer> - kernelFunctor(ConstructorParameters&&... constructorParameters) { - return { - &detail::wrap_kernel_functor::call, - detail::KernelFactory...>(std::forward(constructorParameters)...), - detail::FunctionSchemaInferer() - }; - } -} - -/** - * Use this to register an operator whose kernel is implemented as a functor - * - * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); - * - * The functor constructor can take arguments to configure the kernel. - * The arguments are defined in the kernel registration. 
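
The comment being removed here documents the old free-function form (c10::kernel plus c10::dispatchKey); its constructor-argument example continues just below. Under the options()-based API that this diff migrates everything to, that example would look roughly like the following sketch. The include path and the c10::CPUTensorId() dispatch key are assumptions taken from the surrounding code, and the kernel body is elided as in the original comment.

// Sketch only: the new-style registration for the functor example from the
// removed comment, assuming the options() builder introduced by this diff.
#include <ATen/core/op_registration/op_registration.h>

#include <string>
#include <utility>

namespace {

class my_kernel_cpu final : public c10::OperatorKernel {
 public:
  explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
      : config_(std::move(some_configuration)), a_(a), b_(b) {}

  at::Tensor operator()(at::Tensor a, at::Tensor b) {
    // ... kernel body elided, as in the original comment ...
    return a;
  }

 private:
  std::string config_;
  int a_;
  bool b_;
};

} // namespace

// Constructor arguments now go to options().kernel<Functor>(...), and the
// dispatch key is chained onto the same options object instead of being a
// separate c10::dispatchKey(...) argument.
static auto registry = c10::RegisterOperators().op(
    "my_op(Tensor a, Tensor b) -> Tensor",
    c10::RegisterOperators::options()
        .kernel<my_kernel_cpu>("some_configuration", 3, true)
        .dispatchKey(c10::CPUTensorId()));
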
- * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) - * > : ... {...} - * > - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel("some_configuration", 3, true), - * > c10::dispatchKey(CPUTensorId())); - */ -template -// enable_if: only enable it if KernelFunctor is actually a functor -inline constexpr guts::enable_if_t::value, -detail::KernelRegistrationConfigParameter...>, detail::FunctionSchemaInferer>> -kernel(ConstructorParameters&&... constructorParameters) { - static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); - - return detail::kernelFunctor(std::forward(constructorParameters)...); } } diff --git a/aten/src/ATen/core/op_registration/kernel_functor_test.cpp b/aten/src/ATen/core/op_registration/kernel_functor_test.cpp index 8dae0d26f2f7..b5dbf44fccff 100644 --- a/aten/src/ATen/core/op_registration/kernel_functor_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_functor_test.cpp @@ -7,8 +7,6 @@ using c10::RegisterOperators; using c10::OperatorKernel; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -16,7 +14,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::unique_ptr; @@ -67,32 +64,32 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctorBasedKernel, 
givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); - auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -117,7 +114,7 @@ struct KernelWithoutOutput final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::no_return", ""); ASSERT_TRUE(op.has_value()); @@ -135,7 +132,7 @@ struct KernelWithZeroOutputs final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -153,7 +150,7 @@ struct KernelWithIntOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_output(Tensor dummy, int a, int b) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_output(Tensor 
dummy, int a, int b) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); ASSERT_TRUE(op.has_value()); @@ -171,8 +168,8 @@ struct KernelWithTensorOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); ASSERT_TRUE(op.has_value()); @@ -194,7 +191,7 @@ struct KernelWithTensorListOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -215,7 +212,7 @@ struct KernelWithIntListOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -245,7 +242,7 @@ struct KernelWithMultipleOutputs final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", kernel(), dispatchKey(TensorType1())); + .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -278,8 +275,8 @@ struct KernelWithTensorInputByValueWithOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = 
c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -295,8 +292,8 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByRe TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -326,8 +323,8 @@ struct KernelWithTensorInputByValueWithoutOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -343,8 +340,8 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByRe TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -368,7 +365,7 @@ struct KernelWithIntInputWithoutOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -387,7 +384,7 @@ struct KernelWithIntInputWithOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", 
""); ASSERT_TRUE(op.has_value()); @@ -400,14 +397,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withOu int64_t captured_input_list_size = 0; struct KernelWithIntListInputWithoutOutput final : OperatorKernel { - void operator()(Tensor, ArrayRef input1) { + void operator()(Tensor, const std::vector& input1) { captured_input_list_size = input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -419,14 +416,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_wi } struct KernelWithIntListInputWithOutput final : OperatorKernel { - int64_t operator()(Tensor, ArrayRef input1) { + int64_t operator()(Tensor, const std::vector& input1) { return input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -437,14 +434,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_wi } struct KernelWithTensorListInputWithoutOutput final : OperatorKernel { - void operator()(ArrayRef input1) { + void operator()(const std::vector& input1) { captured_input_list_size = input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -456,14 +453,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput } struct KernelWithTensorListInputWithOutput final : OperatorKernel { - int64_t operator()(ArrayRef input1) { + int64_t operator()(const std::vector& input1) { return input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -483,7 +480,7 @@ struct KernelWithDictInputWithoutOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - 
.op("_test::dict_input(Dict(str, Tensor) input) -> ()", kernel()); + .op("_test::dict_input(Dict(str, Tensor) input) -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -505,7 +502,7 @@ struct KernelWithDictInputWithOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithDictInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, str) input) -> str", kernel()); + .op("_test::dict_input(Dict(str, str) input) -> str", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -526,7 +523,7 @@ struct KernelWithDictOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithDictOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", kernel()); + .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); ASSERT_TRUE(op.has_value()); @@ -556,7 +553,7 @@ class KernelWithCache final : public OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithCache_thenCacheIsKeptCorrectly) { auto registrar = RegisterOperators() - .op("_test::cache_op(Tensor input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::cache_op(Tensor input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::cache_op", ""); ASSERT_TRUE(op.has_value()); @@ -596,8 +593,8 @@ class KernelWithConstructorArg final : public OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithConstructorArg_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(2), dispatchKey(TensorType1())) - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(4), dispatchKey(TensorType2())); + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(2).dispatchKey(TensorType1())) + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(4).dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::offset_op", ""); ASSERT_TRUE(op.has_value()); @@ -626,8 +623,8 @@ class KernelWithMultipleConstructorArgs final : public OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithMultipleConstructorArgs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(2, 3), dispatchKey(TensorType1())) - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(4, 5), dispatchKey(TensorType2())); + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(2, 3).dispatchKey(TensorType1())) + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(4, 5).dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::offset_op", ""); ASSERT_TRUE(op.has_value()); @@ -654,7 +651,7 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenFallbackKernelWithoutAnyA // is no way to get the dispatch key. 
For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel()); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -675,7 +672,7 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenFallbackKernelWithoutTens // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel()); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -699,7 +696,7 @@ struct KernelWithOptInputWithoutOutput final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -736,7 +733,7 @@ struct KernelWithOptInputWithOutput final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -772,7 +769,7 @@ struct KernelWithOptInputWithMultipleOutputs final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? 
arg4) -> (Tensor?, int?, str?)", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -790,14 +787,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_ } struct KernelForSchemaInference final : OperatorKernel { - std::tuple operator()(Tensor arg1, int64_t arg2, ArrayRef arg3) { + std::tuple operator()(Tensor arg1, int64_t arg2, const std::vector& arg3) { return {}; } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", kernel()); + .op("_test::no_schema_specified", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); @@ -815,35 +812,35 @@ template struct KernelFunc final : OperatorKernel TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentNumArguments_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch() -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch() -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. 
Specified 3 but inferred 2" ); } @@ -851,18 +848,18 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDiff TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentArgumentType_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg1, int arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, int arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg1, float arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, float arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in argument 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(int arg1, int arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(int arg1, int arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in argument 1: specified int but inferred Tensor" ); } @@ -870,58 +867,58 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDiff TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentNumReturns_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 1" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 0" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 2 but inferred 0" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 3 but inferred 2" ); } @@ -929,46 +926,46 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDiff TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentReturnTypes_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified Tensor but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred int" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred Tensor" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, int)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, int)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - 
.op("_test::mismatch(Tensor arg) -> (Tensor, float)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, float)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "Type mismatch in return 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified int but inferred Tensor" ); } diff --git a/aten/src/ATen/core/op_registration/kernel_lambda.h b/aten/src/ATen/core/op_registration/kernel_lambda.h index 64952cb52db4..5c01bf21bce1 100644 --- a/aten/src/ATen/core/op_registration/kernel_lambda.h +++ b/aten/src/ATen/core/op_registration/kernel_lambda.h @@ -34,32 +34,4 @@ namespace detail { >; } -/** - * Use this to register an operator whose kernel is implemented as a stateless lambda. - * - * Example: - * - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel([] (Tensor a) -> Tensor{...}), - * > c10::dispatchKey(CPUTensorId())); - */ -template -inline constexpr auto kernel(Lambda&& functor) -> -// enable_if: only enable it if Lambda is a functor (note: lambdas are functors) -guts::enable_if_t>::value, -decltype(detail::kernelFunctor>>(std::forward(functor)))> { - static_assert(!std::is_base_of::value, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); - - // We don't support stateful lambdas (i.e. lambdas with a capture), because their - // behavior would be nonobvious. A functor kernel with cache gets a new instance of - // its cache each time the kernel is looked up from the dispatch table. - // A lambda with a capture would be global and share its capture between all kernel lookups. - // So, instead of making users having to think about it (including the thread-safety - // issues this causes), let's just forbid stateful lambdas alltogether. - static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). 
If you need a cache, please use the functor based API kernel() instead."); - - return detail::kernelFunctor>>(std::forward(functor)); -} - } diff --git a/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp b/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp index 4cf9f170dbb9..0359720a781b 100644 --- a/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp @@ -17,8 +17,6 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -26,7 +24,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -334,7 +331,7 @@ int64_t captured_input_list_size = 0; TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", [] (Tensor, ArrayRef input1) -> void { + .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", [] (Tensor, const std::vector& input1) -> void { captured_input_list_size = input1.size(); }); @@ -349,7 +346,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInp TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> int", [](Tensor, ArrayRef input1) -> int64_t { + .op("_test::int_list_input(Tensor dummy, int[] input) -> int", [](Tensor, const std::vector& input1) -> int64_t { return input1.size(); }); @@ -363,7 +360,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInp TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> ()", [] (ArrayRef input1) -> void { + .op("_test::tensor_list_input(Tensor[] input) -> ()", [] (const std::vector& input1) -> void { captured_input_list_size = input1.size(); }); @@ -378,7 +375,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithTensorList TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithTensorListRefInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> int", [] (ArrayRef input1) -> int64_t { + .op("_test::tensor_list_input(Tensor[] input) -> int", [] (const std::vector& input1) -> int64_t { return input1.size(); }); @@ -448,6 +445,25 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithLegacyTens EXPECT_EQ(2, outputs[0].toInt()); } +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithStringListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::stringlist_output(str[] input) -> str[]", [](std::vector input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::stringlist_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector list{"value1", "value2"}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + auto output = 
std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ("value1", output[0].toString()->string()); + EXPECT_EQ("value2", output[1].toString()->string()); +} + TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { int captured_dict_size = 0; @@ -564,6 +580,111 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithUnorderedM EXPECT_EQ("value2", output.at("key2")); } +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithMapOfList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::dict_output(Dict(str, int[]) input) -> Dict(str, int[])", [](std::unordered_map> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map> dict; + dict.insert({"key1", std::vector{10, 20}}); + dict.insert({"key2", std::vector{30, 40}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output.at("key1").size()); + EXPECT_EQ(10, output.at("key1")[0]); + EXPECT_EQ(20, output.at("key1")[1]); + EXPECT_EQ(2, output.at("key2").size()); + EXPECT_EQ(30, output.at("key2")[0]); + EXPECT_EQ(40, output.at("key2")[1]); +} + + +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithMapOfListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::dict_output(Dict(str, Dict(int,str)[]) input) -> Dict(str, Dict(int,str)[])", [](std::unordered_map>> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map>> dict; + dict.insert({"key1", {{{10, "10"}, {20, "20"}}}}); + dict.insert({"key2", {{{30, "30"}, {40, "40"}}}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(1, output.at("key1").size()); + EXPECT_EQ(2, output.at("key1")[0].size()); + EXPECT_EQ("10", output.at("key1")[0][10]); + EXPECT_EQ("20", output.at("key1")[0][20]); + EXPECT_EQ(2, output.at("key2")[0].size()); + EXPECT_EQ("30", output.at("key2")[0][30]); + EXPECT_EQ("40", output.at("key2")[0][40]); +} + +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int)[] input) -> Dict(str, int)[]", [](std::vector> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector> list{{{"1", 1}, {"2", 2}}, {{"3", 3}, {"4", 4}}}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toInt()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("2").toInt()); + EXPECT_EQ(2, output[1].toGenericDictRef().size()); + EXPECT_EQ(3, output[1].toGenericDictRef().at("3").toInt()); + EXPECT_EQ(4, output[1].toGenericDictRef().at("4").toInt()); +} + 
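
These new nested-container tests pass because the return path added to kernel_functor.h recurses the same way as the argument path: return_type_to_ivalue now has std::vector and std::unordered_map specializations that convert element by element. Below is a standalone mirror of the earlier argument-direction sketch, again with a toy Value type standing in for IValue (an assumption), repeated here so the sketch compiles on its own.

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct Value {
  long long i = 0;
  std::string s;
  std::vector<Value> list;
  std::vector<std::pair<Value, Value>> dict;
};

// Unsupported return types fail to compile, playing the role of the
// static_asserts in the real header.
template <class T, class Enable = void>
struct to_value;

template <>
struct to_value<long long> {
  static Value call(long long v) { Value r; r.i = v; return r; }
};
template <>
struct to_value<std::string> {
  static Value call(std::string v) { Value r; r.s = std::move(v); return r; }
};

// Vectors and maps recurse, so a return type like
// std::vector<std::unordered_map<std::string, std::vector<long long>>>
// (the shape used by the list-of-dict-of-int-list test that follows)
// round-trips through the generic representation.
template <class T>
struct to_value<std::vector<T>> {
  static Value call(std::vector<T>&& v) {
    Value r;
    r.list.reserve(v.size());
    for (auto& elem : v) {
      r.list.push_back(to_value<T>::call(std::move(elem)));
    }
    return r;
  }
};
template <class K, class V>
struct to_value<std::unordered_map<K, V>> {
  static Value call(std::unordered_map<K, V>&& v) {
    Value r;
    r.dict.reserve(v.size());
    for (auto& kv : v) {
      r.dict.emplace_back(to_value<K>::call(K{kv.first}),
                          to_value<V>::call(std::move(kv.second)));
    }
    return r;
  }
};
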
+TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithListOfMapOfIntList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int[])[] input) -> Dict(str, int[])[]", [](std::vector>> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector>> list{{{"1", {1, 2}}, {"3", {3, 4}}}, {{"5", {5, 6}}, {"7", {7, 8}}}}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toIntListRef()[0]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef()[1]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("3").toIntListRef().size()); + EXPECT_EQ(3, output[0].toGenericDictRef().at("3").toIntListRef()[0]); + EXPECT_EQ(4, output[0].toGenericDictRef().at("3").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("5").toIntListRef().size()); + EXPECT_EQ(5, output[1].toGenericDictRef().at("5").toIntListRef()[0]); + EXPECT_EQ(6, output[1].toGenericDictRef().at("5").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("7").toIntListRef().size()); + EXPECT_EQ(7, output[1].toGenericDictRef().at("7").toIntListRef()[0]); + EXPECT_EQ(8, output[1].toGenericDictRef().at("7").toIntListRef()[1]); +} + TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenFallbackKernelWithoutAnyArguments_whenRegistered_thenCanBeCalled) { // note: non-fallback kernels without tensor arguments don't work because there // is no way to get the dispatch key. 
For operators that only have a fallback @@ -705,7 +826,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithOptionalIn TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", [] (Tensor arg1, int64_t arg2, ArrayRef arg3) -> std::tuple {return {};}); + .op("_test::no_schema_specified", [] (Tensor arg1, int64_t arg2, const std::vector& arg3) -> std::tuple {return {};}); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); diff --git a/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp b/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp index 5ae3ffcdcd75..fd60342793d6 100644 --- a/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp @@ -6,8 +6,6 @@ #include using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -15,7 +13,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -47,38 +44,38 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenOutOfLineKernel_whenRegistered_thenCanBeCalled) { auto my_kernel = [] (Tensor, int64_t i) {return i+1;}; - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(my_kernel), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(my_kernel).dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", 
RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType1())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())); - auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i-1;}), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i-1;}).dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -98,8 +95,9 @@ bool was_called = false; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", - kernel([] (const Tensor&) -> void {was_called = true;}), - dispatchKey(TensorType1())); + RegisterOperators::options() + .kernel([] (const Tensor&) -> void {was_called = true;}) + .dispatchKey(TensorType1())); auto op = 
c10::Dispatcher::singleton().findSchema("_test::no_return", ""); ASSERT_TRUE(op.has_value()); @@ -111,8 +109,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithoutOutput_whenRe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", - kernel([] (const Tensor&) -> std::tuple<> {was_called = true; return {};}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const Tensor&) -> std::tuple<> {was_called = true; return {};}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -125,8 +123,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithZeroOutputs_when TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_output(Tensor dummy, int a, int b) -> int", - kernel([] (Tensor, int64_t a, int64_t b) {return a+b;}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, int64_t a, int64_t b) {return a+b;}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); ASSERT_TRUE(op.has_value()); @@ -139,11 +137,11 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntOutput_whenRe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::returning_tensor(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType1())) .op("_test::returning_tensor(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); ASSERT_TRUE(op.has_value()); @@ -160,8 +158,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorOutput_whe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", - kernel([] (const Tensor& a, const Tensor& b, const Tensor& c) -> std::vector {return {a, b, c};}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const Tensor& a, const Tensor& b, const Tensor& c) -> std::vector {return {a, b, c};}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -177,8 +175,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListOutput TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", - kernel([] (const Tensor&, int64_t a, int64_t b, int64_t c) -> std::vector {return {a,b,c};}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const Tensor&, int64_t a, int64_t b, int64_t c) -> std::vector {return {a,b,c};}) + .dispatchKey(TensorType1())); auto op = 
c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -194,7 +192,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListOutput_wh TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", - kernel([] (Tensor) -> std::tuple, c10::optional, Dict> { + RegisterOperators::options().kernel([] (Tensor) -> std::tuple, c10::optional, Dict> { Dict dict; dict.insert("first", dummyTensor(TensorType1())); dict.insert("second", dummyTensor(TensorType2())); @@ -205,8 +203,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithMultipleOutputs_ c10::optional(c10::in_place, 0), dict ); - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -228,11 +226,11 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithMultipleOutputs_ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -249,11 +247,11 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByRef TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (Tensor a) {return a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (Tensor a) {return a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (Tensor a) {return a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (Tensor a) {return a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -272,11 +270,11 @@ Tensor captured_input; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (const Tensor& a) -> void {captured_input = a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (const Tensor& a) -> void {captured_input = a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (const Tensor& a) -> void {captured_input = a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (const Tensor& a) -> void {captured_input = a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -293,11 +291,11 @@ 
TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByRef TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (Tensor a) -> void {captured_input = a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (Tensor a) -> void {captured_input = a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (Tensor a) -> void {captured_input = a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (Tensor a) -> void {captured_input = a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -316,8 +314,8 @@ int64_t captured_int_input = 0; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_input(Tensor dummy, int input) -> ()", - kernel([] (Tensor, int64_t a) -> void {captured_int_input = a;}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, int64_t a) -> void {captured_int_input = a;}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -331,8 +329,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntInput_without TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_input(Tensor dummy, int input) -> int", - kernel([] (Tensor, int64_t a) {return a + 1;}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, int64_t a) {return a + 1;}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -347,8 +345,8 @@ int64_t captured_input_list_size = 0; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", - kernel([] (Tensor, ArrayRef a) {captured_input_list_size = a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, const std::vector& a) {captured_input_list_size = a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -362,8 +360,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_wit TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_list_input(Tensor dummy, int[] input) -> int", - kernel([] (Tensor, ArrayRef a) -> int64_t {return a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, const std::vector& a) -> int64_t {return a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -376,8 +374,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_wit TEST(OperatorRegistrationTest_LambdaBasedKernel, 
givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_list_input(Tensor[] input) -> ()", - kernel([] (ArrayRef a) -> void {captured_input_list_size = a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const std::vector& a) -> void {captured_input_list_size = a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -391,8 +389,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListInput_ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_list_input(Tensor[] input) -> int", - kernel([] (ArrayRef a) -> int64_t {return a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const std::vector& a) -> int64_t {return a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -406,7 +404,7 @@ int captured_dict_size = 0; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, Tensor) input) -> ()", kernel([] (Dict input1) { + .op("_test::dict_input(Dict(str, Tensor) input) -> ()", RegisterOperators::options().kernel([] (Dict input1) { captured_dict_size = input1.size(); })); @@ -424,7 +422,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withou TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, str) input) -> str", kernel([] (Dict input1) { + .op("_test::dict_input(Dict(str, str) input) -> str", RegisterOperators::options().kernel([] (Dict input1) { return input1.at("key2"); })); @@ -441,7 +439,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withOu TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", kernel([] (Dict input) { + .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", RegisterOperators::options().kernel([] (Dict input) { return input; })); @@ -467,7 +465,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenFallbackKernelWithoutAnyAr // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel([] () {called = true;})); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel([] () {called = true;})); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -482,7 +480,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenFallbackKernelWithoutTenso // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. 
auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel([] (int64_t arg) {return arg + 1;})); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel([] (int64_t arg) {return arg + 1;})); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -499,13 +497,13 @@ c10::optional called_arg4; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", - kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; called_arg4 = arg4; - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -534,14 +532,14 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_w TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", - kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; called_arg4 = arg4; return arg2; - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -572,10 +570,10 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_w TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? 
arg4) -> (Tensor?, int?, str?)", - kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -594,7 +592,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_w TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", kernel([] (Tensor arg1, int64_t arg2, ArrayRef arg3) -> std::tuple {return {};})); + .op("_test::no_schema_specified", RegisterOperators::options().kernel([] (Tensor arg1, int64_t arg2, const std::vector& arg3) -> std::tuple {return {};})); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); @@ -605,35 +603,35 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegisteredWitho TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentNumArguments_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch() -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch() -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of arguments is different. 
Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 3 but inferred 2" ); } @@ -641,18 +639,18 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDiffe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentArgumentType_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg1, int arg2) -> int", kernel([] (Tensor, int64_t) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, int arg2) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg1, float arg2) -> int", kernel([] (Tensor, int64_t) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, float arg2) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in argument 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(int arg1, int arg2) -> int", kernel([] (Tensor, int64_t) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(int arg1, int arg2) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in argument 1: specified int but inferred Tensor" ); } @@ -660,58 +658,58 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDiffe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentNumReturns_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 1" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor) -> void {}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 0" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel([] (Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel([] (Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 2 but inferred 0" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 3 but inferred 2" ); } @@ -719,46 +717,46 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDiffe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentReturnTypes_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified Tensor but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred int" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> Tensor {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> Tensor {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel([] (Tensor) -> Tensor {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel([] (Tensor) -> Tensor {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred Tensor" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, int)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, int)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, float)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, float)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified int but inferred Tensor" ); } diff --git a/aten/src/ATen/core/op_registration/kernel_stackbased.h b/aten/src/ATen/core/op_registration/kernel_stackbased.h deleted file mode 100644 index 8ec981979f31..000000000000 --- 
a/aten/src/ATen/core/op_registration/kernel_stackbased.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -/** - * This file implements c10::kernel(stack_based_kernel) which is used in the - * kernel registration API to set the dispatch key for a registered kernel. - * You probably don't want to use this API, stack based kernels are internal - * only. There's other, better kernel APIs which are built on top of this one. - * - * You probably don't want to include this file directly but include - * op_registration.h instead since that adds more functionality you'll - * likely need to register your operators. - */ - -#include - -namespace c10 { - -namespace detail { - - struct NoFunctionSchemaInference final { - std::unique_ptr operator()() const { - return nullptr; - } - }; - - template - struct KernelRegistrationConfigParameter final { - template - constexpr KernelRegistrationConfigParameter(KernelFunction* kernel_func, KernelCacheCreatorFunction__&& cache_creator_func, InferFunctionSchemaFunction&& infer_function_schema_func) - : kernel_func_(kernel_func) - , cache_creator_func_(std::forward(cache_creator_func)) - , infer_function_schema_func_(std::forward(infer_function_schema_func)) { - } - - void apply(KernelRegistrationConfig* registration) const & { - registration->kernel_func = kernel_func_; - registration->cache_creator_func = cache_creator_func_; - registration->inferred_function_schema = infer_function_schema_func_(); - } - - void apply(KernelRegistrationConfig* registration) && { - registration->kernel_func = kernel_func_; - registration->cache_creator_func = std::move(cache_creator_func_); - registration->inferred_function_schema = std::move(infer_function_schema_func_)(); - } - - private: - KernelFunction* kernel_func_; - KernelCacheCreatorFunction_ cache_creator_func_; - InferFunctionSchemaFunction infer_function_schema_func_; - }; - - static_assert(is_registration_config_parameter>::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); -} - -/** - * Use this to register an operator whose kernel is implemented by a stack - * based function. This is meant to be used internally, for example for writing - * wrappers for other ways of writing operators. This is not part of the - * public API. 
- * - * Example: - * - * > namespace { - * > void my_kernel_cpu(Stack* stack, KernelCache* cache) {...} - * > unique_ptr my_cache_creator() {...} - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(&my_kernel_cpu, &my_cache_creator), - * > c10::dispatchKey(CPUTensorId())); - */ -template -inline constexpr detail::KernelRegistrationConfigParameter, detail::NoFunctionSchemaInference> kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction_&& cache_creator) { - static_assert(detail::is_registration_config_parameter, detail::NoFunctionSchemaInference>>::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); - - return {kernel_func, std::forward(cache_creator), detail::NoFunctionSchemaInference()}; -} - -} diff --git a/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp b/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp index 0fc2060601a5..e5cf67e16db5 100644 --- a/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp @@ -6,8 +6,6 @@ #include using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -60,32 +58,32 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_StackBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_StackBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType1())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_StackBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())); - auto registrar3 = 
RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_StackBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&decrementKernel, &noCache), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&decrementKernel, &noCache).dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -112,7 +110,7 @@ TEST(OperatorRegistrationTest_StackBasedKernel, givenFallbackKernelWithoutAnyArg // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel(&kernelWithoutInputs, &noCache)); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel(&kernelWithoutInputs, &noCache)); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -131,7 +129,7 @@ TEST(OperatorRegistrationTest_StackBasedKernel, givenFallbackKernelWithoutTensor // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. 
auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel(&kernelWithoutTensorInputs, &noCache)); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel(&kernelWithoutTensorInputs, &noCache)); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -146,7 +144,7 @@ void kernelForSchemaInference(Stack* stack, KernelCache* cache) { TEST(OperatorRegistrationTest_StackBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenFailsBecauseItCannotInferFromStackBasedKernel) { expectThrows([] { - RegisterOperators().op("_test::no_schema_specified", kernel(&kernelForSchemaInference, &noCache)); + RegisterOperators().op("_test::no_schema_specified", RegisterOperators::options().kernel(&kernelForSchemaInference, &noCache)); }, "Cannot infer schema from this kernel function. Please explicitly specify the operator schema."); } @@ -165,7 +163,7 @@ void increment_sequence_kernel(Stack* stack, KernelCache* cache) { } TEST(OperatorRegistrationTest_StackBasedKernel, givenKernelWithCache_whenCalled_thenCacheIsHandledCorrectly) { - auto registrar = RegisterOperators().op("_test::increment_sequence(Tensor dummy) -> int", kernel(&increment_sequence_kernel, &make_cache), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::increment_sequence(Tensor dummy) -> int", RegisterOperators::options().kernel(&increment_sequence_kernel, &make_cache).dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::increment_sequence", ""); ASSERT_TRUE(op.has_value()); diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index ec0677ea4cec..a651cb26ae60 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -38,48 +38,48 @@ class RegisterOperators::OperatorRegistrar final { c10::optional kernel_registration_handle_; }; -void RegisterOperators::checkSchemaAndRegisterOp_(const std::string& schemaOrNameStr, detail::KernelRegistrationConfig&& config) { +void RegisterOperators::checkSchemaAndRegisterOp_(const std::string& schemaOrNameStr, Options&& options) { #if defined(CAFFE2_IS_XPLAT_BUILD) throw std::logic_error("We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build."); #else either schemaOrName = torch::jit::parseSchemaOrName(schemaOrNameStr); if (schemaOrName.is_right()) { // schema was explicitly specified. Check it matches the inferred one and register the op. - checkSchemaAndRegisterOp_(std::move(schemaOrName).right(), std::move(config)); + checkSchemaAndRegisterOp_(std::move(schemaOrName).right(), std::move(options)); } else { // schema wasn't explicitly specified. Take the inferred schema for registering the op. - AT_ASSERTM(nullptr != config.inferred_function_schema.get(), "Cannot infer schema from this kernel function. Please explicitly specify the operator schema."); + AT_ASSERTM(nullptr != options.config.inferred_function_schema.get(), "Cannot infer schema from this kernel function. 
Please explicitly specify the operator schema."); OperatorName name = std::move(schemaOrName).left(); FunctionSchema inferredSchema( std::move(name.name), std::move(name.overload_name), - config.inferred_function_schema->arguments(), - config.inferred_function_schema->returns(), - config.inferred_function_schema->is_vararg(), - config.inferred_function_schema->is_varret() + options.config.inferred_function_schema->arguments(), + options.config.inferred_function_schema->returns(), + options.config.inferred_function_schema->is_vararg(), + options.config.inferred_function_schema->is_varret() ); - registerOp_(std::move(inferredSchema), std::move(config)); + registerOp_(std::move(inferredSchema), std::move(options)); } #endif } -void RegisterOperators::checkSchemaAndRegisterOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config) { - if (config.inferred_function_schema.get() != nullptr) { - assertSchemasHaveSameSignature(*config.inferred_function_schema, schema); +void RegisterOperators::checkSchemaAndRegisterOp_(FunctionSchema&& schema, Options&& options) { + if (options.config.inferred_function_schema.get() != nullptr) { + assertSchemasHaveSameSignature(*options.config.inferred_function_schema, schema); } - registerOp_(std::move(schema), std::move(config)); + registerOp_(std::move(schema), std::move(options)); } -void RegisterOperators::registerOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config) { - AT_CHECK(!config.dispatch_key.has_value() || config.kernel_func != nullptr, +void RegisterOperators::registerOp_(FunctionSchema&& schema, Options&& options) { + TORCH_CHECK(!options.config.dispatch_key.has_value() || options.config.kernel_func != nullptr, "Tried to register an operator with a dispatch key but without a kernel. " "Please either specify a kernel or omit the dispatch key to only register the schema."); // if kernel_func is set, so must be cache_creator_func, the API shouldn't allow anything else. - AT_ASSERT((config.kernel_func != nullptr) == static_cast(config.cache_creator_func)); + AT_ASSERT((options.config.kernel_func != nullptr) == static_cast(options.config.cache_creator_func)); - registrars_.emplace_back(std::move(schema), config.dispatch_key, config.kernel_func, std::move(config.cache_creator_func)); + registrars_.emplace_back(std::move(schema), options.config.dispatch_key, options.config.kernel_func, std::move(options.config.cache_creator_func)); } RegisterOperators::RegisterOperators() = default; diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index d5c576b7e604..076101f436ee 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -5,9 +5,7 @@ * functionality needed to do so for you. 
*/ -#include -#include -#include +#include #include #include #include @@ -30,11 +28,11 @@ namespace c10 { * > } * > * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CPUTensorId())); */ -class C10_API RegisterOperators final { +class CAFFE2_API RegisterOperators final { public: RegisterOperators(); ~RegisterOperators(); @@ -44,83 +42,238 @@ class C10_API RegisterOperators final { RegisterOperators(RegisterOperators&&) noexcept; RegisterOperators& operator=(RegisterOperators&&) noexcept; + class CAFFE2_API Options final { + public: + Options(const Options&) = delete; + Options(Options&&) noexcept = delete; + Options& operator=(const Options&) = delete; + Options& operator=(Options&&) noexcept = delete; + + // internal-only for registering stack based kernels + Options&& kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction&& cache_creator) && { + return std::move(*this).kernel(kernel_func, std::move(cache_creator), nullptr); + } + + /** + * Use this to register an operator whose kernel is implemented as a functor + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CPUTensorId())); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel("some_configuration", 3, true) + * > .dispatchKey(CPUTensorId())); + */ + template + // enable_if: only enable it if KernelFunctor is actually a functor + guts::enable_if_t::value, Options&&> kernel(ConstructorParameters&&... constructorParameters) { + static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + + return std::move(*this).kernelFunctor(std::forward(constructorParameters)...); + } + + /** + * Use this to register an operator whose kernel is implemented by a function: + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators() + * > .kernel() + * > .dispatchKey(CPUTensorId())); + */ + template + // enable_if: only enable it if FuncType is actually a function + guts::enable_if_t::value, Options&&> kernel() { + static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) 
API or also implement the kernel function as defined by the public API."); + + return kernel::type>(); + } + + /** + * Use this to register an operator whose kernel is implemented as a lambda. + * The lambda must be stateless, i.e. not have a capture. If your kernel + * needs to store some configuration parameters, write the kernel as a + * functor instead. + * + * Example: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel([] (Tensor a) -> Tensor {...}) + * > .dispatchKey(CPUTensorId())); + */ + template + // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) + guts::enable_if_t>::value, Options&&> kernel(Lambda&& functor) { + static_assert(!std::is_base_of::value, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); + + // We don't support stateful lambdas (i.e. lambdas with a capture), because their + // behavior would be nonobvious. A functor kernel with cache gets a new instance of + // its cache each time the kernel is looked up from the dispatch table. + // A lambda with a capture would be global and share its capture between all kernel lookups. + // So, instead of making users having to think about it (including the thread-safety + // issues this causes), let's just forbid stateful lambdas alltogether. + static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead."); + + return std::move(*this).kernelFunctor>>(std::forward(functor)); + } + + /** + * Use this to register an operator with a kernel for a certain dispatch key. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > class my_kernel_cuda final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CPUTensorId())) + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CUDATensorId())); + */ + Options&& dispatchKey(TensorTypeId dispatch_key) && { + if (config.dispatch_key.has_value()) { + AT_ERROR("Operator registration: Cannot register multiple dispatch keys in the same op() call. Please call op() multiple times if you want to register multiple kernels."); + } + config.dispatch_key = dispatch_key; + return std::move(*this); + } + + private: + Options&& kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction&& cache_creator, std::unique_ptr&& inferred_function_schema) && { + if (nullptr != config.kernel_func) { + AT_ERROR("Operator registration: Cannot register multiple kernels in the same op() call. 
Please call op() multiple times if you want to register multiple kernels."); + } + AT_ASSERTM(nullptr == config.cache_creator_func, "kernel_func was nullptr, so cache_creator_func must be too"); + AT_ASSERTM(nullptr == config.inferred_function_schema, "kernel_func was nullptr, so inferred_function_schema must be too"); + + config.kernel_func = kernel_func; + config.cache_creator_func = std::move(cache_creator); + config.inferred_function_schema = std::move(inferred_function_schema); + return std::move(*this); + } + + template + Options&& kernelFunctor(ConstructorParameters&&... constructorParameters) && { + return std::move(*this).kernel( + &detail::wrap_kernel_functor::call, + detail::KernelFactory...>(std::forward(constructorParameters)...), + detail::FunctionSchemaInferer()() + ); + } + + Options() = default; + + // KernelRegistrationConfig accumulates all information from the config + // parameters passed to a RegisterOperators::op() call into one object. + struct KernelRegistrationConfig final { + KernelRegistrationConfig() + : dispatch_key(c10::nullopt) + , kernel_func(nullptr) + , cache_creator_func(nullptr) + , inferred_function_schema(nullptr) + {} + + c10::optional dispatch_key; + KernelFunction* kernel_func; + KernelCacheCreatorFunction cache_creator_func; + std::unique_ptr inferred_function_schema; + }; + + KernelRegistrationConfig config; + friend class RegisterOperators; + }; + /** - * Register an operator based on a function schema and a set of configuration - * parameters (i.e. kernel function, dispatch key, ...). - * - * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); + * Call this to get an instance of registration options, which + * can be passed to a call to RegisterOperators::op() to specify + * these options for the operator registration. + * See class doc comment for examples. */ - template - RegisterOperators op(const std::string& schemaOrName, ConfigParameters&&... configParameters) && { - static_assert(guts::conjunction>...>::value, - "Invalid argument passed to op(). Examples for valid arguments are c10::kernel(...) for defining a kernel " - " and c10::dispatchKey(...) for defining a dispatch key. Please see the documentation for registering c10 operators."); - - op_(schemaOrName, std::forward(configParameters)...); - return std::move(*this); + static Options options() { + return {}; } - // This FunctionSchema based variant is only meant to be used for internal - // purposes when we already have a pre-parsed FunctionSchema. - // This is for example used for exposing legacy caffe2 operators to c10. - template - RegisterOperators op(FunctionSchema schema, ConfigParameters&&... configParameters) && { - static_assert(guts::conjunction>...>::value, - "Invalid argument passed to op(). Examples for valid arguments are c10::kernel(...) for defining a kernel " - " and c10::dispatchKey(...) for defining a dispatch key. Please see the documentation for registering c10 operators."); - - op_(std::move(schema), std::forward(configParameters)...); + /** + * Call this to register an operator. See class doc comment for examples. 
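The Options-based registration documented in the comment blocks above is hard to read here because this rendering of the patch drops the text inside angle brackets. The sketch below restores plausible template arguments as an assumption; MyAddCPU, the "my_ops::my_add" schema, and the namespace are hypothetical names used only for illustration, not part of the patch.

#include <ATen/core/op_registration/op_registration.h>

namespace {

// Kernel functor: inherits from c10::OperatorKernel as required by kernel<...>().
class MyAddCPU final : public c10::OperatorKernel {
 public:
  at::Tensor operator()(at::Tensor a, at::Tensor b) {
    return a + b;
  }
};

// One op() call registers one kernel for one dispatch key; chain further
// op() calls to register kernels for other dispatch keys.
static auto registry = c10::RegisterOperators()
    .op("my_ops::my_add(Tensor a, Tensor b) -> Tensor",
        c10::RegisterOperators::options()
            .kernel<MyAddCPU>()                // template argument assumed
            .dispatchKey(c10::CPUTensorId())); // spelled as in the doc comment above

} // namespace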
+ */ + RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && { + checkSchemaAndRegisterOp_(schemaOrName, std::move(options)); return std::move(*this); } - template - C10_DEPRECATED_MESSAGE("Registering kernels via passing arguments to RegisterOperators(...) is deprecated. " \ - "Please use RegisterOperators().op(...) instead.") - // enable_if: only enable it if FuncType is actually a function, but not a stack based KernelFunction. - explicit RegisterOperators(guts::enable_if_t::value && !std::is_same::value, const std::string&> schemaOrName, FuncType* func) - : RegisterOperators() { - legacyAPIOp_(schemaOrName, func); + // internal only for registering caffe2 ops + RegisterOperators&& op(FunctionSchema schema, Options&& options) && { + checkSchemaAndRegisterOp_(std::move(schema), std::move(options)); + return std::move(*this); } template - C10_DEPRECATED_MESSAGE("Registering kernels via passing arguments to RegisterOperators(...) is deprecated. " \ - "Please use RegisterOperators().op(...) instead.") - // enable_if: only enable it if FuncType is actually a functor - explicit RegisterOperators(guts::enable_if_t::value, const std::string&> schemaOrName, FuncType&& func) + explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options()) : RegisterOperators() { - legacyAPIOp_(schemaOrName, std::forward(func)); + std::move(*this).op(schemaOrName, std::forward(func), std::move(options)); } /** - * Deprecated. For backwards compatibility only. - * Don't use this, it introduces a performance overhead on each kernel call - * due to the kernel being stored in the wrapper as a runtime function pointer. + * This API registers an operator based on a kernel function pointer. * * Given a kernel * * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } * - * This deprecated API looks like: + * This API looks like: * * > static auto registry = c10::RegisterOperators() * > .op("my_op", &my_kernel_cpu); * - * But you should use the new API instead: + * If your kernel is small and the overhead of calling it matters, + * then this API might be the wrong choice since the followig API + * has a slightly lower overhead for calling into the kernel: * * > static auto registry = c10::RegisterOperators() - * > .op("my_op", kernel()); + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); * * Or, alternatively, write your kernel as a functor: * @@ -132,83 +285,47 @@ class C10_API RegisterOperators final { * > } * > * > static auto registry = c10::RegisterOperators() - * > .op("my_op", c10::kernel()); + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); */ - template - C10_DEPRECATED_MESSAGE("Registering kernels via passing function pointers to op() directly is deprecated. " \ - "Please use the new c10::kernel() based API instead.") + template // enable_if: only enable it if FuncType is actually a function, but not a stack based KernelFunction. - guts::enable_if_t::value && !std::is_same::value, RegisterOperators> - op(const std::string& schemaOrName, FuncType* func, OtherArgs...) && { - // We intentionally don't extend this deprecated API to support dispatch keys - // and the like to push people towards using the new API. - static_assert(sizeof...(OtherArgs) == 0, "The deprecated function pointer based API to register kernels doesn't allow additional arguments for dispatch keys or other things. 
Please use the new c10::kernel() based API instead."); - - legacyAPIOp_(schemaOrName, func); - return std::move(*this); + guts::enable_if_t::value && !std::is_same::value, RegisterOperators&&> + op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && { + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(schemaOrName, std::move(options).kernelFunctor>, AllowLegacyTypes>(func)); } /** - * Deprecated. For backwards compatibility only. + * This API registers an operator based on a kernel lambda. * - * This deprecated API looks like: + * This API looks like: * * > static auto registry = c10::RegisterOperators() * > .op("my_op", [] (Tensor a, Tensor b) {...}); * - * But you should use the new API instead: + * This is equivalent to: * * > static auto registry = c10::RegisterOperators() - * > .op("my_op", kernel([] (Tensor a, Tensor b) {...})); - * - * Or, alternatively, write your kernel as a functor: + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel([] (Tensor a, Tensor b) {...})); * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", c10::kernel()); */ - template - C10_DEPRECATED_MESSAGE("Registering kernels via passing lambdas to op() directly is deprecated. " \ - "Please use the new c10::kernel() based API instead.") + template // enable_if: only enable it if FuncType is actually a functor - guts::enable_if_t::value, RegisterOperators> - op(const std::string& schemaOrName, FuncType&& func, OtherArgs...) && { - // We intentionally don't extend this deprecated API to support dispatch keys - // and the like to push people towards using the new API. - static_assert(sizeof...(OtherArgs) == 0, "The deprecated lambda based API to register kernels doesn't allow additional arguments for dispatch keys or other things. Please use the new c10::kernel() based API instead."); - - static_assert(!std::is_base_of::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new c10::kernel() based API instead."); + guts::enable_if_t::value, RegisterOperators&&> + op(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options()) && { + static_assert(!std::is_base_of::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); - legacyAPIOp_(schemaOrName, std::forward(func)); - return std::move(*this); + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(schemaOrName, std::move(options).kernelFunctor>, AllowLegacyTypes>(std::forward(func))); } private: - template - void op_(FunctionSchema&& schema, ConfigParameters&&... configParameters) { - checkSchemaAndRegisterOp_(std::move(schema), detail::make_registration_config(std::forward(configParameters)...)); - } - template - void op_(const std::string& schemaOrName, ConfigParameters&&... 
configParameters) { - checkSchemaAndRegisterOp_(schemaOrName, detail::make_registration_config(std::forward(configParameters)...)); - } - - template - void legacyAPIOp_(const std::string& schemaOrName, FuncType&& func) { - constexpr bool AllowLegacyTypes = true; - op_(schemaOrName, detail::kernelFunctor>, AllowLegacyTypes>(std::forward(func))); - } - - void checkSchemaAndRegisterOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config); - void checkSchemaAndRegisterOp_(const std::string& schemaOrName, detail::KernelRegistrationConfig&& config); + void checkSchemaAndRegisterOp_(FunctionSchema&& schema, Options&& config); + void checkSchemaAndRegisterOp_(const std::string& schemaOrName, Options&& config); - void registerOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config); + void registerOp_(FunctionSchema&& schema, Options&& config); class OperatorRegistrar; diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index b44838a78eb3..63dd7b5391d9 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -13,8 +13,6 @@ using c10::RegisterOperators; using c10::OperatorKernel; -using c10::kernel; -using c10::dispatchKey; using c10::Dispatcher; using c10::IValue; using at::Tensor; @@ -40,7 +38,7 @@ struct MockKernel final : OperatorKernel { bool* called_; }; TEST(OperatorRegistrationTest, givenOpWithoutFallbackKernel_whenCallingOpWithWrongDispatchKey_thenFails) { - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -50,9 +48,9 @@ TEST(OperatorRegistrationTest, givenOpWithoutFallbackKernel_whenCallingOpWithWro } TEST(OperatorRegistrationTest, givenOpWithFallbackKernelOutOfScope_whenCallingOpWithWrongDispatchKey_thenFails) { - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); { - auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel()); + auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel()); // this registered a fallback kernel, but now that registration goes out of scope and deregisters it } @@ -65,7 +63,7 @@ TEST(OperatorRegistrationTest, givenOpWithFallbackKernelOutOfScope_whenCallingOp TEST(OperatorRegistrationTest, givenOpWithOnlyFallbackKernel_whenCallingOp_thenCallsFallbackKernel) { bool called = false; - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called)); // note: no dispatch key means this is the fallback kernel + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called)); // note: no dispatch key means this is the fallback kernel auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -77,9 +75,9 @@ TEST(OperatorRegistrationTest, givenOpWithOnlyFallbackKernel_whenCallingOp_thenC 
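The hunks above also keep the function-pointer and lambda overloads of op() as non-deprecated conveniences, noting that the options().kernel<...>() form has slightly lower call overhead. A minimal sketch of the two shorthands, assuming the usual op_registration header path and using hypothetical my_relu names and schemas:

#include <ATen/core/op_registration/op_registration.h>

namespace {

at::Tensor my_relu(at::Tensor a) {
  return at::relu(a);
}

// Function-pointer shorthand: the kernel is stored as a runtime function
// pointer, which is why the comment block above mentions a small call overhead.
static auto registry_fn = c10::RegisterOperators()
    .op("my_ops::my_relu(Tensor a) -> Tensor", &my_relu);

// Lambda shorthand: only stateless (capture-free) lambdas are accepted.
static auto registry_lambda = c10::RegisterOperators()
    .op("my_ops::my_relu2(Tensor a) -> Tensor",
        [] (at::Tensor a) { return at::relu(a); });

} // namespace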
TEST(OperatorRegistrationTest, givenOpWithOnlyFallbackKernelAndOtherKernelOutOfScope_whenCallingOp_thenCallsFallbackKernel) { bool called = false; bool other_called = false; - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called)); // note: no dispatch key means this is the fallback kernel + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called)); // note: no dispatch key means this is the fallback kernel { - auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&other_called), dispatchKey(TensorType2())); + auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&other_called).dispatchKey(TensorType2())); } auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); @@ -94,8 +92,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstFallbackAndThenOtherKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())); + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -110,8 +108,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstFallbackAndThenOtherKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())); + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -127,8 +125,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstOtherAndThenFallbackKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())) - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())) + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -143,8 +141,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstOtherAndThenFallbackKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - 
.op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())) - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())) + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -178,7 +176,7 @@ TEST(OperatorRegistrationTest, givenOpWithoutKernels_whenRegisteringKernelAfterw auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); bool called_kernel = false; - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered @@ -190,7 +188,7 @@ TEST(OperatorRegistrationTest, givenOpWithoutKernels_whenRegisteringKernelAfterw auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); { - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); } auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); @@ -211,22 +209,22 @@ TEST(OperatorRegistrationTest, givenOpWithoutKernelsWithoutTensorInputs_whenRegi TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenRegistering_thenShowsWarning) { auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered testing::internal::CaptureStderr(); - c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); std::string output = testing::internal::GetCapturedStderr(); - EXPECT_THAT(output, testing::HasSubstr("Registered a kernel that overwrote a previoulsy registered kernel with same dispatch key")); + EXPECT_THAT(output, testing::HasSubstr("Registered a kernel that overwrote a previously registered kernel with same dispatch key")); } TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", 
c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered @@ -239,8 +237,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenCalled_thenCa TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered @@ -253,8 +251,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenCalle TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenNewerKernelDeletedAndOpCalled_thenCallsOlderKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -269,8 +267,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenNewerKernelDe TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewerKernelDeletedAndOpCalled_thenCallsOlderKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -285,8 +283,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewer TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenOlderKernelDeletedAndOpCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = 
c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -301,8 +299,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenOlderKernelDe TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenOlderKernelDeletedAndOpCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -318,8 +316,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenOlderAndThenN bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar1 = c10::RegisterOperators(); // destruct the registrar registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -336,8 +334,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenOlder bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar1 = c10::RegisterOperators(); // destruct the registrar registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -354,8 +352,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenNewerAndThenO bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = 
c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar2 = c10::RegisterOperators(); // destruct the registrar registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -372,8 +370,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewer bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar2 = c10::RegisterOperators(); // destruct the registrar registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -386,7 +384,23 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewer }, "Didn't find kernel to dispatch to for operator '_test::dummy'"); } +TEST(OperatorRegistrationTest, whenTryingToRegisterWithMultipleKernels_thenFails) { + expectThrows([&] { + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().kernel()); + }, "Cannot register multiple kernels in the same op() call"); +} + +TEST(OperatorRegistrationTest, whenTryingToRegisterWithMultipleDispatchKeys_thenFails) { + expectThrows([&] { + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1()).dispatchKey(TensorType2())); + }, "Cannot register multiple dispatch keys in the same op() call"); +} +TEST(OperatorRegistrationTest, whenTryingToRegisterWithDispatchKeyWithoutKernel_thenFails) { + expectThrows([&] { + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().dispatchKey(TensorType1())); + }, "Tried to register an operator with a dispatch key but without a kernel"); +} /** * This is used to check that a given type works correctly when passed as input @@ -416,7 +430,7 @@ struct ArgTypeTestKernel final : OperatorKernel { } static void test(InputType input, std::function inputExpectation, OutputType output, std::function outputExpectation, const std::string& schema) { - auto registry = c10::RegisterOperators().op("_test::my_op" + schema, kernel(input, std::move(inputExpectation), std::move(output))); + auto registry = c10::RegisterOperators().op("_test::my_op" + schema, c10::RegisterOperators::options().kernel(input, std::move(inputExpectation), std::move(output))); auto op = Dispatcher::singleton().findSchema("_test::my_op", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered auto actualOutput = callOp(*op, std::move(input)); @@ -551,43 +565,47 
@@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { // list types (with empty list) - testArgTypes, std::vector>::test( - c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, + testArgTypes>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toDoubleListRef().size());}, "(float[] a) -> float[]"); - testArgTypes, std::vector>::test( - c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, + testArgTypes, std::vector>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toIntListRef().size());}, "(int[] a) -> int[]"); - // TODO Converting std::vector to ArrayRef doesn't work, so we - // need to find an alternative - // testArgTypes, std::vector>::test( - // c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, - // std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toBoolListRef().size());}, - // "(bool[] a) -> bool[]"); - // testArgTypes, std::vector>::test( - // c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, - // std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toBoolListRef().size());}, - // "(bool[] a) -> bool[]"); - // TODO We currently don't support str[] (i.e. string list) as type. Do we want to? - // testArgTypes, std::vector>::test( - // c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, - // std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toStringListRef().size());}, - // "(str[] a) -> str[]"); + testArgTypes>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, + std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toBoolListRef().size());}, + "(bool[] a) -> bool[]"); + testArgTypes>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, + std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toGenericListRef().size());}, + "(str[] a) -> str[]"); // list types (with non-empty list) - testArgTypes, std::vector>::test( - c10::ArrayRef({1.5, 2.5}), [] (c10::ArrayRef v) {EXPECT_EQ(c10::ArrayRef({1.5, 2.5}), v);}, + testArgTypes>::test( + std::vector({1.5, 2.5}), [] (const std::vector& v) {EXPECT_EQ(std::vector({1.5, 2.5}), v);}, std::vector({3.5, 4.5}), [] (const IValue& v) {EXPECT_EQ(std::vector({3.5, 4.5}), v.toDoubleListRef());}, "(float[] a) -> float[]"); - testArgTypes, std::vector>::test( - c10::ArrayRef({1, 2}), [] (c10::ArrayRef v) {EXPECT_EQ(c10::ArrayRef({1, 2}), v);}, + testArgTypes>::test( + std::vector({1, 2}), [] (const std::vector& v) {EXPECT_EQ(std::vector({1, 2}), v);}, std::vector({3, 4}), [] (const IValue& v) {EXPECT_EQ(std::vector({3, 4}), v.toIntListRef());}, "(int[] a) -> int[]"); - // TODO When fixing bool[] and str[] (see above), also add them here - testArgTypes, std::vector>::test( - c10::ArrayRef({dummyTensor(TensorType1()), dummyTensor(TensorType2())}), [] (c10::ArrayRef v) { + testArgTypes>::test( + std::vector({true, false}), [] (const std::vector& v) {EXPECT_EQ(std::vector({true, false}), v);}, + std::vector({true, false}), [] (const IValue& v) {EXPECT_EQ(std::vector({true, false}), v.toBoolListRef());}, + "(bool[] a) -> bool[]"); + testArgTypes>::test( + std::vector({"first", "second"}), [] (const std::vector& v) {EXPECT_EQ(std::vector({"first", "second"}), v);}, + std::vector({"first", "second"}), [] (const IValue& v) { + EXPECT_EQ(2, v.toGenericListRef().size()); + EXPECT_EQ("first", v.toGenericListRef()[0].toStringRef()); + EXPECT_EQ("second", 
v.toGenericListRef()[1].toStringRef()); + }, + "(str[] a) -> str[]"); + testArgTypes>::test( + std::vector({dummyTensor(TensorType1()), dummyTensor(TensorType2())}), [] (const std::vector& v) { EXPECT_EQ(2, v.size()); EXPECT_EQ(TensorType1(), v[0].type_id()); EXPECT_EQ(TensorType2(), v[1].type_id()); @@ -600,20 +618,20 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Tensor[] a) -> Tensor[]"); // Test optional of list (with nullopt) - testArgTypes>, c10::optional>>::test( - c10::optional>(c10::nullopt), [] (c10::optional> v) {EXPECT_FALSE(v.has_value());}, + testArgTypes>>::test( + c10::optional>(c10::nullopt), [] (const c10::optional>& v) {EXPECT_FALSE(v.has_value());}, c10::optional>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(int[]? a) -> int[]?"); // Test optional of list (with empty list) - testArgTypes>, c10::optional>>::test( - c10::optional>(c10::ArrayRef{}), [] (c10::optional> v) {EXPECT_EQ(0, v.value().size());}, + testArgTypes>>::test( + c10::optional>(std::vector{}), [] (const c10::optional>& v) {EXPECT_EQ(0, v.value().size());}, c10::optional>(std::vector{}), [] (const IValue& v) {EXPECT_EQ(0, v.toIntListRef().size());}, "(int[]? a) -> int[]?"); // Test optional of list (with values) - testArgTypes>, c10::optional>>::test( - c10::optional>({1, 2}), [] (c10::optional> v) {EXPECT_EQ(c10::ArrayRef({1, 2}), v.value());}, + testArgTypes>>::test( + c10::optional>({1, 2}), [] (const c10::optional>& v) {EXPECT_EQ(std::vector({1, 2}), v.value());}, c10::optional>({3, 4}), [] (const IValue& v) {EXPECT_EQ(std::vector({3, 4}), v.toIntListRef());}, "(int[]? a) -> int[]?"); diff --git a/aten/src/ATen/core/op_registration/test_helpers.h b/aten/src/ATen/core/op_registration/test_helpers.h index 5110ec6df065..fcf4c7a3e570 100644 --- a/aten/src/ATen/core/op_registration/test_helpers.h +++ b/aten/src/ATen/core/op_registration/test_helpers.h @@ -36,6 +36,30 @@ struct InputToIValue> final { } }; template +struct InputToIValue>> final { + template + static c10::IValue call(T_&& v) { + auto list = c10::ivalue::GenericList::create({}); + list->elements().reserve(v.size()); + for (std::unordered_map& e : v) { + list->elements().push_back(InputToIValue>::call(std::move(e))); + } + return list; + } +}; +template<> +struct InputToIValue> final { + template + static c10::IValue call(T_&& v) { + auto list = c10::ivalue::GenericList::create({}); + list->elements().reserve(v.size()); + for (std::string& e : v) { + list->elements().push_back(InputToIValue::call(std::move(e))); + } + return list; + } +}; +template struct InputToIValue> final { template static c10::IValue call(T_&& v) { @@ -46,12 +70,12 @@ template struct InputToIValue> final { template static c10::IValue call(T_&& v) { - c10::Dict dict; + c10::impl::GenericDict dict; dict.reserve(v.size()); for (auto& element : v) { - dict.insert(element.first, element.second); + dict.insert(InputToIValue::call(element.first), InputToIValue::call(element.second)); } - return InputToIValue>::call(std::move(dict)); + return c10::IValue(std::move(dict)); } }; } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 60dc0c973208..2520c90f1d2e 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -25,7 +25,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } out << ")"; } else if (auto value = t.cast()) { - out << "Tensor(dtype = "; + out << "ProfiledTensor(dtype = "; if (value->scalarType().has_value()) { out << *value->scalarType(); @@ -157,6 +157,8 @@ TypePtr 
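The testArgTypes changes above switch kernel-side list arguments from c10::ArrayRef to std::vector and enable bool[] and str[] lists. Assuming std::vector is now the accepted kernel-side list type, as the updated tests suggest, a kernel taking a float[] argument would look roughly like the following; the lambda, schema, and names are illustrative only.

#include <ATen/core/op_registration/op_registration.h>
#include <vector>

namespace {

// Lists arrive as std::vector<double> rather than c10::ArrayRef<double>,
// matching the "(float[] a) -> float[]" schema exercised by the tests.
static auto registry = c10::RegisterOperators()
    .op("my_ops::scale_all(float[] a) -> float[]",
        [] (std::vector<double> a) -> std::vector<double> {
          for (double& x : a) {
            x *= 2.0;
          }
          return a;
        });

} // namespace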
incompleteInferTypeFrom(const IValue& value) { return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom)); } else if (value.isDevice()) { return DeviceObjType::get(); + } else if (value.isObject()) { + return value.toObject()->type(); } AT_ERROR("Type cannot be accurately recovered from this IValue."); } diff --git a/aten/src/ATen/cpp_custom_type_hack.h b/aten/src/ATen/cpp_custom_type_hack.h index 660c4bb6ff82..4b3b48583f24 100644 --- a/aten/src/ATen/cpp_custom_type_hack.h +++ b/aten/src/ATen/cpp_custom_type_hack.h @@ -14,9 +14,9 @@ namespace cpp_custom_type_hack { template T& cast(const Tensor& packed) { - AT_CHECK( + TORCH_CHECK( packed.scalar_type() == kByte, "Expected temporary cpp type wrapper"); - AT_CHECK( + TORCH_CHECK( packed.storage().data_ptr().get_deleter() == caffe2::TypeMeta::Make().deleteFn(), "Expected temporary cpp type wrapper of type ", diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 724510b94870..f68880367123 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -728,10 +728,6 @@ inline bool CUDA_tensor_apply1(at::Tensor a, rearrangeDims(&aInfo); aInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!aInfo.isContiguous()) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims); } else { @@ -748,9 +744,6 @@ inline bool CUDA_tensor_apply1(at::Tensor a, if (aInfo.dims == 1) { HANDLE_CASE(uint64_t, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_CASE(uint64_t, -1); } } @@ -881,10 +874,6 @@ inline bool CUDA_tensor_apply2(at::Tensor a, rearrangeDims(&aInfo, &bInfo); aInfo.collapseDims(); bInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && bInfo.isContiguous())) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); } else { @@ -904,9 +893,6 @@ inline bool CUDA_tensor_apply2(at::Tensor a, if (aInfo.dims == 1 && bInfo.dims == 1) { HANDLE_CASE(uint64_t, 1, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_CASE(uint64_t, -1, -1); } } @@ -1071,10 +1057,6 @@ inline bool CUDA_tensor_apply3(at::Tensor a, bInfo.collapseDims(); cInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); } else { detail::TensorInfo aInfo = @@ -1098,10 +1080,6 @@ inline bool CUDA_tensor_apply3(at::Tensor a, if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { HANDLE_CASE(uint64_t, 1, 1, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif - HANDLE_CASE(uint64_t, -1, -1, -1); } } @@ -1311,10 +1289,6 @@ inline bool CUDA_tensor_apply4(at::Tensor a, cInfo.collapseDims(); dInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && 
bInfo.isContiguous() && cInfo.isContiguous() && dInfo.isContiguous())) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims, dInfo.dims); } else { detail::TensorInfo aInfo = @@ -1342,9 +1316,6 @@ inline bool CUDA_tensor_apply4(at::Tensor a, if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1 && dInfo.dims == 1) { HANDLE_CASE(uint64_t, 1, 1, 1, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_CASE(uint64_t, -1, -1, -1, -1); } } diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 53eadd4459bc..f27886722730 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -112,7 +112,7 @@ struct AT_CUDA_API CUDAEvent { createEvent(stream.device_index()); } - AT_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, + TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, " does not match recording stream's device ", stream.device_index(), "."); CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaEventRecord(event_, stream)); @@ -130,7 +130,7 @@ struct AT_CUDA_API CUDAEvent { // Note: cudaEventElapsedTime can be safely called from any device float elapsed_time(const CUDAEvent& other) const { - AT_CHECK(is_created_ && other.isCreated(), + TORCH_CHECK(is_created_ && other.isCreated(), "Both events must be recorded before calculating elapsed time."); float time_ms = 0; // raise cudaErrorNotReady if either event is recorded but not yet completed diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index b750cf326b39..2f475d501628 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -17,7 +17,7 @@ struct OffsetCalculator { using offset_type = at::cuda::Array; OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides) : dims(dims) { - AT_CHECK(dims <= MAX_DIMS, "tensor has too many (>25) dims"); + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>25) dims"); for (int i = 0; i < MAX_DIMS; ++i) { if (i < dims) { sizes_[i] = IntDivider(sizes[i]); diff --git a/aten/src/ATen/cuda/detail/TensorInfo.cuh b/aten/src/ATen/cuda/detail/TensorInfo.cuh index 7dfa9051e103..b5fcbe222391 100644 --- a/aten/src/ATen/cuda/detail/TensorInfo.cuh +++ b/aten/src/ATen/cuda/detail/TensorInfo.cuh @@ -62,7 +62,7 @@ TensorInfo::TensorInfo(T* p, template void TensorInfo::reduceDim(int dim) { - AT_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); sizes[dim] = 1; } diff --git a/aten/src/ATen/detail/FunctionTraits.h b/aten/src/ATen/detail/FunctionTraits.h index a8f84e6994cf..547bb71ea8aa 100644 --- a/aten/src/ATen/detail/FunctionTraits.h +++ b/aten/src/ATen/detail/FunctionTraits.h @@ -49,6 +49,12 @@ struct function_traits { }; }; +template +struct nullary_function_traits { + using traits = function_traits; + using result_type = typename traits::result_type; +}; + template struct unary_function_traits { using traits = function_traits; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index d4915a7fd0d0..68f0eb4dbc98 100644 --- a/aten/src/ATen/function_wrapper.py +++ 
b/aten/src/ATen/function_wrapper.py @@ -115,19 +115,9 @@ def TypedDict(name, attrs, total=True): # type: ignore TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} - ${dispatch_scalar_type_declaration} - switch (dispatch_scalar_type) { - ${cases} - ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); - break; - default: - AT_ERROR("${api_name} not supported on ${Type} for ", dispatch_scalar_type); - } + ${type_definition_body} } """) -TYPE_DERIVED_DEFINITION_NATIVE_CASE = CodeTemplate("""\ -case ScalarType::${ScalarName}: -""") TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { AT_ERROR("${api_name} not supported on ${Type}"); @@ -1660,10 +1650,9 @@ def process_native(option): TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) else: option['native_type_method_dispatch'] = native_dispatch - cases = [] - for scalar_type in option['backend_types'][backend]: - cases.append(TYPE_DERIVED_DEFINITION_NATIVE_CASE.substitute(env, ScalarName=scalar_type)) - type_object_definitions.append(TYPE_DERIVED_DEFINITION_NATIVE.substitute(env, cases=cases)) + body = TYPE_DEFINITION_BODY_NATIVE.substitute(env) + type_object_definitions.append( + TYPE_DERIVED_DEFINITION_NATIVE.substitute(env, type_definition_body=body)) for declaration in declarations: for option in declaration['options']: diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index f4bdf22621a6..1960b95dd300 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -188,7 +188,9 @@ def backend_to_devicetype(backend): # scalar_name, c_type, accreal, is_floating_type quantized_scalar_types = [ - ('QInt8', 'qint8', 'QInt8AccrealNotDefined', 'Qint8IsFloatingTypeNotDefined'), + ('QInt8', 'qint8', 'QInt8AccrealNotDefined', 'QInt8IsFloatingTypeNotDefined'), + ('QUInt8', 'quint8', 'QUInt8AccrealNotDefined', 'QUInt8IsFloatingTypeNotDefined'), + ('QInt32', 'qint32', 'QInt32AccrealNotDefined', 'Qint32IsFloatingTypeNotDefined'), ] @@ -390,9 +392,6 @@ def legacy_iterate_types(): for scalar_type in (scalar_types + quantized_scalar_types): if density == 'Mkldnn' and (backend != 'CPU' or scalar_type[0] != 'Float'): continue - if density == 'Sparse' and scalar_type[0] == 'Half': - # THS does not do half type yet. - continue else: yield (backend, density, scalar_type) for backend in quantized_backends: diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h index 77176a3acada..b80fc02baeeb 100644 --- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -16,7 +16,7 @@ class HIPStreamMasqueradingAsCUDA { explicit HIPStreamMasqueradingAsCUDA(Stream stream) : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) { // We did the coercion unchecked; check that it was right. - AT_CHECK(stream.device().type() == DeviceType::CUDA /* !!! */); + TORCH_CHECK(stream.device().type() == DeviceType::CUDA /* !!! 
*/); } explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream) diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 55bc3c44df50..b35017ec0737 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -147,8 +147,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { auto input = self.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(weight.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int64_t weight_num = weight.numel(); Tensor result = at::empty_like(input); @@ -162,7 +162,7 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { } else { // case2: multiple weights, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; @@ -173,7 +173,7 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); @@ -276,9 +276,9 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten auto grad_out = grad_out_.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(grad_out.is_contiguous()); - AT_CHECK(weight.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(grad_out.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int64_t weight_num = weight.numel(); auto strides = input.strides(); @@ -296,7 +296,7 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten } else { // case2: multiple parameters, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; @@ -307,7 +307,7 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. 
Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 1546621da55f..e2badf7d6f05 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -77,13 +77,13 @@ namespace { IntArrayRef output_size) { for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); /* sizes */ diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp new file mode 100644 index 000000000000..38ae3d5f63ce --- /dev/null +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -0,0 +1,312 @@ +#include +#include + +namespace at { +namespace native { + +namespace { + +inline int start_index(int a, int b, int c) { + return (int)std::floor((float)(a * c) / b); +} + +inline int end_index(int a, int b, int c) { + return (int)std::ceil((float)((a + 1) * c) / b); +} + +template +static void adaptive_avg_pool3d_out_frame( + scalar_t* input_p, + scalar_t* output_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideT, + int64_t istrideH, + int64_t istrideW) { + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) { + /* loop over output */ + int64_t ot, oh, ow; + for (ot = 0; ot < osizeT; ot++) { + int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for (oh = 0; oh < osizeH; oh++) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = 0; ow < osizeW; ow++) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + scalar_t* ip = input_p + d * istrideD + istartT * istrideT + + istartH * istrideH + istartW * istrideW; + scalar_t* op = output_p + d * osizeT * osizeH * osizeW + + ot * osizeH * osizeW + oh * osizeW + ow; + + /* compute local average: */ + scalar_t sum = 0; + int it, ih, iw; + for (it = 0; it < kT; it++) { + for (ih = 0; ih < kH; ih++) { + for (iw = 0; iw < kW; iw++) { + scalar_t val = + *(ip + it * istrideT + ih * istrideH + iw * istrideW); + sum += val; + } + } + } + + /* set output to local average */ + *op = sum / kT / kH / kW; + } + } + } + } +} + +void adaptive_avg_pool3d_out_cpu_template( + Tensor& output, + Tensor const& input, + IntArrayRef output_size) { + for (int64_t i = 0; i < input.ndimension(); i++) { + TORCH_CHECK( + input.size(i) > 0, + "adaptive_avg_pool3d(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", + input.sizes(), + " with dimension ", + i, + " being " + "empty"); + } + + TORCH_CHECK( + (input.ndimension() == 4 || input.ndimension() == 5), + "non-empty 4D or 5D (batch mode) tensor expected for input"); + + /* sizes */ + int64_t sizeD = input.size(-4); + int64_t isizeT = input.size(-3); + int64_t isizeH = 
input.size(-2); + int64_t isizeW = input.size(-1); + /* strides */ + int64_t istrideD = input.stride(-4); + int64_t istrideT = input.stride(-3); + int64_t istrideH = input.stride(-2); + int64_t istrideW = input.stride(-1); + /* output sizes */ + auto osizeT = output_size[0]; + auto osizeH = output_size[1]; + auto osizeW = output_size[2]; + + if (input.ndimension() == 4) { + output.resize_({sizeD, osizeT, osizeH, osizeW}); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { + auto input_data = input.data(); + auto output_data = output.data(); + adaptive_avg_pool3d_out_frame( + input_data, + output_data, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW, + istrideD, + istrideT, + istrideH, + istrideW); + }); + } else { + output.resize_({input.size(-5), sizeD, osizeT, osizeH, osizeW}); + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < input.size(0); b++) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { + auto input_data = input.data(); + auto output_data = output.data(); + adaptive_avg_pool3d_out_frame( + input_data + b * input.stride(0), + output_data + b * sizeD * osizeT * osizeH * osizeW, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW, + istrideD, + istrideT, + istrideH, + istrideW); + }); + } + } +} + +template +static void adaptive_avg_pool3d_backward_out_frame( + scalar_t* gradInput_p, + scalar_t* gradOutput_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW) { + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) { + scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH; + scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; + + /* calculate average */ + int64_t ot, oh, ow; + for (ot = 0; ot < osizeT; ot++) { + int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for (oh = 0; oh < osizeH; oh++) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = 0; ow < osizeW; ow++) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + scalar_t grad_delta = + gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT / + kH / kW; + + int it, ih, iw; + for (it = istartT; it < iendT; it++) { + for (ih = istartH; ih < iendH; ih++) { + for (iw = istartW; iw < iendW; iw++) { + /* update gradient */ + gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] += + grad_delta; + } + } + } + } + } + } + } +} + +Tensor& adaptive_avg_pool3d_backward_out_cpu_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + /* get contiguous gradOutput */ + auto gradOutput = gradOutput_.contiguous(); + + /* sizes */ + int64_t sizeD = input.size(-4); + int64_t isizeT = input.size(-3); + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); + int64_t osizeT = gradOutput.size(-3); + int64_t osizeH = gradOutput.size(-2); + int64_t osizeW = gradOutput.size(-1); + + /* backprop */ + if (input.ndimension() == 4) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { + /* get raw pointers */ + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + + adaptive_avg_pool3d_backward_out_frame( + 
gradInput_data, + gradOutput_data, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW); + }); + } else { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < input.size(0); b++) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { + /* get raw pointers */ + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + adaptive_avg_pool3d_backward_out_frame( + gradInput_data + b * sizeD * isizeT * isizeH * isizeW, + gradOutput_data + b * sizeD * osizeT * osizeH * osizeW, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW); + }); + } + } + return gradInput; +} + +} // namespace + +Tensor& adaptive_avg_pool3d_out_cpu( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + adaptive_avg_pool3d_out_cpu_template(output, input, output_size); + return output; +} + +Tensor adaptive_avg_pool3d_cpu(Tensor const& input, IntArrayRef output_size) { + auto output = at::empty({0}, input.options()); + adaptive_avg_pool3d_out_cpu_template(output, input, output_size); + return output; +} + +Tensor& adaptive_avg_pool3d_backward_out_cpu( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + gradInput.resize_as_(input).zero_(); + adaptive_avg_pool3d_backward_out_cpu_template(gradInput, gradOutput_, input); + return gradInput; +} + +Tensor adaptive_avg_pool3d_backward_cpu( + const Tensor& gradOutput_, + const Tensor& input) { + auto gradInput = at::zeros_like(input); + adaptive_avg_pool3d_backward_out_cpu_template(gradInput, gradOutput_, input); + return gradInput; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp index c6a774e98486..ae36afe5290e 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp @@ -134,16 +134,16 @@ void adaptive_max_pool2d_out_cpu_template( int64_t istrideB = 0; for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool2d: expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 2, + TORCH_CHECK(output_size.size() == 2, "adaptive_max_pool2d: internal error: output_size.size() must be 2"); if (input.ndimension() == 4) diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 5b9cdb08595c..7d6581d78bef 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -156,16 +156,16 @@ void adaptive_max_pool3d_out_cpu_template( int64_t istrideW = 0; for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool3d: expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 4 || input.ndimension() == 5), + TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 3, + TORCH_CHECK(output_size.size() == 3, 
"adaptive_max_pool3d: internal error: output_size.size() must be 3"); if (input.ndimension() == 5) diff --git a/aten/src/ATen/native/AffineGridGenerator.cpp b/aten/src/ATen/native/AffineGridGenerator.cpp index 7ab91d2d8a50..e54aa6e56061 100644 --- a/aten/src/ATen/native/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/AffineGridGenerator.cpp @@ -67,7 +67,7 @@ Tensor affine_grid_generator_5D( } Tensor affine_grid_generator(const Tensor& theta, IntArrayRef size) { - AT_CHECK( + TORCH_CHECK( size.size() == 4 || size.size() == 5, "AffineGridGenerator needs 4d (spatial) or 5d (volumetric) inputs."); if (size.size() == 4) { @@ -108,7 +108,7 @@ Tensor affine_grid_generator_5D_backward( } Tensor affine_grid_generator_backward(const Tensor& grad, IntArrayRef size) { - AT_CHECK( + TORCH_CHECK( size.size() == 4 || size.size() == 5, "AffineGridGenerator needs 4d (spatial) or 5d (volumetric) inputs."); if (size.size() == 4) { diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index ef6268c6bad1..c71ff615d858 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -138,28 +138,22 @@ static void apply_solve(Tensor& b, Tensor& A, std::vector& infos) { #else auto A_data = A.data(); auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); auto ipiv = at::empty({n}, b.options().dtype(kInt)); int info; - if (b.dim() == 2) { - lapackSolve(n, nrhs, A_data, n, ipiv.data(), b_data, n, &info); - infos[0] = info; - } else { - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackSolve(n, nrhs, A_working_ptr, n, ipiv.data(), b_working_ptr, n, &info); - infos[i] = info; - if (info != 0) { - return; - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackSolve(n, nrhs, A_working_ptr, n, ipiv.data(), b_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; } } #endif @@ -182,9 +176,9 @@ std::tuple _solve_helper_cpu(const Tensor& self, const Tensor& A // Supports arbitrary batch dimensions for self and A std::tuple solve(const Tensor& self, const Tensor& A) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "B should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - AT_CHECK(A.dim() >= 2, + TORCH_CHECK(A.dim() >= 2, "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); Tensor self_broadcasted, A_broadcasted; std::tie(self_broadcasted, A_broadcasted) = _linear_solve_broadcast_args(self, A); @@ -208,7 +202,6 @@ static void apply_inverse(Tensor& self, std::vector& infos) { #else auto self_data = self.data(); auto self_matrix_stride = matrixStride(self); - auto batch_size = batchCount(self); auto n = self.size(-2); @@ -217,8 +210,8 @@ static void apply_inverse(Tensor& self, std::vector& infos) { scalar_t wkopt; Tensor work; + int info; for (int64_t i = 0; i < batch_size; i++) { - int info; scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; lapackLu(n, n, self_working_ptr, n, ipiv.data(), &info); infos[i] = info; @@ -249,7 +242,11 @@ Tensor _inverse_helper_cpu(const Tensor& 
self) { AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cpu", [&]{ apply_inverse(self_working_copy, infos); }); - batchCheckErrors(infos, "inverse_cpu"); + if (self.dim() > 2) { + batchCheckErrors(infos, "inverse_cpu"); + } else { + singleCheckErrors(infos[0], "inverse_cpu"); + } return self_working_copy; } @@ -257,9 +254,6 @@ Tensor inverse(const Tensor &self) { if (self.size(-1) == 0) { return at::empty_like(self); } - if (self.dim() == 2) { - return at::legacy::th::_th_getri_single(self); - } squareCheckInputs(self); return at::_inverse_helper(self); } @@ -283,25 +277,20 @@ static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, std::vector(); auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); int info; - if (b.dim() == 2) { - lapackCholeskySolve(uplo, n, nrhs, A_data, n, b_data, n, &info); - infos[0] = info; - } else { - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackCholeskySolve(uplo, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info); - infos[i] = info; - if (info != 0) { - return; - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackCholeskySolve(uplo, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; } } #endif @@ -324,9 +313,9 @@ Tensor _cholesky_solve_helper_cpu(const Tensor& self, const Tensor& A, bool uppe // Supports arbitrary batch dimensions for self and A Tensor cholesky_solve(const Tensor& self, const Tensor& A, bool upper) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - AT_CHECK(A.dim() >= 2, + TORCH_CHECK(A.dim() >= 2, "u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); Tensor self_broadcasted, A_broadcasted; std::tie(self_broadcasted, A_broadcasted) = _linear_solve_broadcast_args(self, A); @@ -350,22 +339,17 @@ static void apply_cholesky(Tensor& self, bool upper, std::vector& infos char uplo = upper ? 
'U' : 'L'; auto self_data = self.data(); + auto self_matrix_stride = matrixStride(self); + auto batch_size = batchCount(self); auto n = self.size(-2); int info; - if (self.dim() == 2) { - lapackCholesky(uplo, n, self_data, n, &info); - infos[0] = info; - } else { - auto self_matrix_stride = matrixStride(self); - auto batch_size = batchCount(self); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - lapackCholesky(uplo, n, self_working_ptr, n, &info); - infos[i] = info; - if (info != 0) { - return; - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; + lapackCholesky(uplo, n, self_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; } } #endif @@ -417,28 +401,23 @@ static void apply_lu(Tensor& self, Tensor& pivots, Tensor& infos) { auto self_data = self.data(); auto pivots_data = pivots.data(); auto infos_data = infos.data(); - + auto self_matrix_stride = matrixStride(self); + auto pivots_matrix_stride = pivots.size(-1); + auto batch_size = batchCount(self); auto n = self.size(-1); - if (self.dim() == 2) { - lapackLu(n, n, self_data, n, pivots_data, infos_data); - } else { - auto self_matrix_stride = matrixStride(self); - auto batch_size = batchCount(self); - auto pivots_matrix_stride = pivots.size(-1); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - int* pivots_working_ptr = &pivots_data[i * pivots_matrix_stride]; - int* infos_working_ptr = &infos_data[i]; - lapackLu(n, n, self_working_ptr, n, pivots_working_ptr, infos_working_ptr); - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; + int* pivots_working_ptr = &pivots_data[i * pivots_matrix_stride]; + int* infos_working_ptr = &infos_data[i]; + lapackLu(n, n, self_working_ptr, n, pivots_working_ptr, infos_working_ptr); } #endif } std::tuple _lu_with_info_cpu(const Tensor& self, bool pivot, bool check_errors) { - AT_CHECK(pivot, "lu without pivoting is not implemented on the CPU"); - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(pivot, "lu without pivoting is not implemented on the CPU"); + TORCH_CHECK(self.dim() >= 2, "expected tensor with 2 or more dimensions, got size: ", self.sizes(), " instead"); squareCheckInputs(self); @@ -458,10 +437,10 @@ std::tuple _lu_with_info_cpu(const Tensor& self, bool pi }); } if (check_errors) { - if (self.dim() == 2) { - singleCheckErrors(infos_tensor.item(), "lu"); - } else { + if (self.dim() > 2) { batchCheckErrors(infos_tensor, "lu"); + } else { + singleCheckErrors(infos_tensor.item(), "lu"); } } return std::make_tuple(self_working_copy, pivots_tensor, infos_tensor); @@ -621,21 +600,17 @@ static void apply_triangular_solve(Tensor& b, Tensor& A, bool upper, bool transp auto A_data = A.data(); auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); int info; - if (b.dim() == 2) { - lapackTriangularSolve(uplo, trans, diag, n, nrhs, A_data, n, b_data, n, &info); - } else { - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackTriangularSolve(uplo, trans, diag, n, nrhs, A_working_ptr, n, b_working_ptr, 
n, &info); - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackTriangularSolve(uplo, trans, diag, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info); } #endif } @@ -653,9 +628,9 @@ std::tuple _triangular_solve_helper_cpu(const Tensor& self, cons // Supports arbitrary batch dimensions for self and A std::tuple triangular_solve(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - AT_CHECK(A.dim() >= 2, + TORCH_CHECK(A.dim() >= 2, "u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); Tensor self_broadcasted, A_broadcasted; std::tie(self_broadcasted, A_broadcasted) = _linear_solve_broadcast_args(self, A); diff --git a/aten/src/ATen/native/ConstantPadNd.cpp b/aten/src/ATen/native/ConstantPadNd.cpp index 594604688149..48caea7f54ba 100644 --- a/aten/src/ATen/native/ConstantPadNd.cpp +++ b/aten/src/ATen/native/ConstantPadNd.cpp @@ -3,7 +3,7 @@ namespace at { namespace native { Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, Scalar value) { - AT_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", + TORCH_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", pad.size()); auto input_sizes = self.sizes(); @@ -11,7 +11,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, Scalar value) { auto l_pad = pad.size() / 2; auto l_diff = l_inp - l_pad; - AT_CHECK(l_inp >= l_pad, "Length of pad should be no more than twice the number of " + TORCH_CHECK(l_inp >= l_pad, "Length of pad should be no more than twice the number of " "dimensions of the input. Pad length is ", pad.size(), "while the input has ", l_inp, "dimensions."); @@ -48,7 +48,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, Scalar value) { for (int i = 0; i < l_pad; i++) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - AT_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", pad[pad_idx], " and ", pad[pad_idx + 1], "resulted in a negative output size, " "which is invalid. 
Check dimension ", l_diff + i, "of your input."); new_shape.emplace_back(new_dim); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index bc02fdc25119..34afeeb9b586 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -211,18 +211,18 @@ static void check_shape_forward(const at::Tensor& input, auto dilation = params.dilation; bool transposed = params.transposed; - AT_CHECK(!params.is_padding_neg(), "negative padding is not supported"); - AT_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); - AT_CHECK(!params.is_stride_neg(), "negative stride is not supported"); + TORCH_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); + TORCH_CHECK(!params.is_stride_neg(), "negative stride is not supported"); - AT_CHECK(weight_dim == k, + TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, "-dimensional weight ", weight_sizes, ", but got ", k, "-dimensional input of size ", input.sizes(), " instead"); - AT_CHECK(weight_sizes[0] >= groups, + TORCH_CHECK(weight_sizes[0] >= groups, "Given groups=", groups, ", expected weight to be at least ", groups, " at dimension 0, but got weight of size ", weight_sizes, " instead"); - AT_CHECK(weight_sizes[0] % groups == 0, + TORCH_CHECK(weight_sizes[0] % groups == 0, "Given groups=", groups, ", expected weight to be divisible by ", groups, " at dimension 0, but got weight of size ", weight_sizes, " instead"); @@ -232,12 +232,12 @@ static void check_shape_forward(const at::Tensor& input, std::vector kernel_shape; bool kernel_size_correct = true; - AT_CHECK(input.size(1) == (weight_sizes[1] * groups), + TORCH_CHECK(input.size(1) == (weight_sizes[1] * groups), "Given groups=", groups, ", weight of size ", weight_sizes, ", expected input", input.sizes(), " to have ", (weight_sizes[1] * groups), " channels, but got ", input.size(1), " channels instead"); - AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]), + TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]), "Given weight of size ", weight_sizes, ", expected bias to be 1-dimensional with ", weight_sizes[0], " elements", ", but got bias of size ", bias.sizes(), " instead"); @@ -251,7 +251,7 @@ static void check_shape_forward(const at::Tensor& input, } } - AT_CHECK(input_shape.size() == kernel_shape.size(), "Inconsistent shape between Input and Kernel"); + TORCH_CHECK(input_shape.size() == kernel_shape.size(), "Inconsistent shape between Input and Kernel"); if (!kernel_size_correct) { // If kernel size is incorrect @@ -270,11 +270,11 @@ static void check_shape_forward(const at::Tensor& input, "Kernel size: (", kernel_ss.str(), "). 
Kernel size can't be greater than actual input size"); } } else { // transposed - AT_CHECK(input.size(1) == weight_sizes[0], + TORCH_CHECK(input.size(1) == weight_sizes[0], "Given transposed=", transposed, ", weight of size ", weight_sizes, ", expected input", input.sizes(), " to have ", weight_sizes[0], " channels, but got ", input.size(1), " channels instead"); - AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups), + TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups), "Given transposed=", transposed, ", weight of size ", weight_sizes, ", expected bias to be 1-dimensional with ", weight_sizes[1] * groups, " elements", ", but got bias of size ", bias.sizes(), " instead"); @@ -282,14 +282,14 @@ static void check_shape_forward(const at::Tensor& input, } static auto view4d(const at::Tensor& tensor) -> at::Tensor { - AT_CHECK(tensor.ndimension() == 3, + TORCH_CHECK(tensor.ndimension() == 3, "expected 3D tensor, got tensor with ", tensor.ndimension(), " dimensions instead"); return tensor.unsqueeze(2); } static auto view3d(const at::Tensor& tensor) -> at::Tensor { - AT_CHECK(tensor.ndimension() == 4, + TORCH_CHECK(tensor.ndimension() == 4, "expected 4D tensor, got tensor with ", tensor.ndimension(), " dimensions instead"); return tensor.squeeze(2); @@ -378,7 +378,7 @@ at::Tensor _convolution( } int64_t dim = k - 2; - AT_CHECK(dim > 0, "weight should have at least three dimensions"); + TORCH_CHECK(dim > 0, "weight should have at least three dimensions"); ConvParams params; params.stride = expand_param_if_needed(stride_, "stride", dim); @@ -409,10 +409,10 @@ at::Tensor _convolution( auto dilation = params.dilation; output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); } else if (params.use_cudnn(input)) { - AT_CHECK(input.type() == weight.type(), + TORCH_CHECK(input.type() == weight.type(), "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), ") should be the same"); - AT_CHECK(!bias.defined() || (input.type() == bias.type()), + TORCH_CHECK(!bias.defined() || (input.type() == bias.type()), "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), ") should be the same"); @@ -426,10 +426,10 @@ at::Tensor _convolution( params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); } } else if (params.use_miopen(input)) { - AT_CHECK(input.type() == weight.type(), + TORCH_CHECK(input.type() == weight.type(), "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), ") should be the same"); - AT_CHECK(!bias.defined() || (input.type() == bias.type()), + TORCH_CHECK(!bias.defined() || (input.type() == bias.type()), "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), ") should be the same"); @@ -444,10 +444,10 @@ at::Tensor _convolution( } } else if (params.use_mkldnn(input)) { #if AT_MKLDNN_ENABLED() - AT_CHECK(input.type() == weight.type(), + TORCH_CHECK(input.type() == weight.type(), "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), ") should be the same"); - AT_CHECK(!bias.defined() || (input.type() == bias.type()), + TORCH_CHECK(!bias.defined() || (input.type() == bias.type()), "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), ") should be the same"); if (!input_is_mkldnn) { diff --git 
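Most of the hunks in this stretch of the patch are a mechanical rename of AT_CHECK to TORCH_CHECK; the call shape is unchanged: a boolean condition followed by any number of message fragments that are stringified and concatenated into the error text, with a failed check throwing c10::Error. A small self-contained usage sketch; check_is_matrix and the shapes below are illustrative, not taken from the patch:

#include <ATen/ATen.h>
#include <c10/util/Exception.h>

// Validate an argument the same way the call sites above do: condition first,
// then message pieces that are concatenated into the thrown error message.
void check_is_matrix(const at::Tensor& t) {
  TORCH_CHECK(t.dim() == 2,
              "expected a 2D tensor, but got ", t.dim(), " dimensions instead");
}

int main() {
  check_is_matrix(at::zeros({3, 3}));   // passes silently
  try {
    check_is_matrix(at::zeros({3}));    // fails: 1D input
  } catch (const c10::Error& e) {
    // e.what() contains "expected a 2D tensor, but got 1 dimensions instead"
  }
  return 0;
}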
a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index c798582dc0b7..90eec5d7853f 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -6,11 +6,11 @@ namespace at { namespace native { Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, int64_t pad) { - AT_CHECK(self.dim() == 3, "Input must have 3 dims: time, batch, " + TORCH_CHECK(self.dim() == 3, "Input must have 3 dims: time, batch, " "in_channel"); - AT_CHECK(weight.dim() == 3, "Weight tensor must have 3 dims: kernel_width," + TORCH_CHECK(weight.dim() == 3, "Weight tensor must have 3 dims: kernel_width," " in_channels, out_channels."); - AT_CHECK(bias.dim() == 1, "Bias must be 1-D"); + TORCH_CHECK(bias.dim() == 1, "Bias must be 1-D"); auto input_size = self.sizes(); auto weight_size = weight.sizes(); @@ -27,9 +27,9 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in // Input = (time, batch, in_channels) // Weight = (kernel_width, in_channels, out_channels) // Bias = (out_channels) - AT_CHECK(inputPlanes == weight_size[1], "Input dim 2 (input channels) " + TORCH_CHECK(inputPlanes == weight_size[1], "Input dim 2 (input channels) " "is not == dim 1 in the weight tensor"); - AT_CHECK(weight_size[2] == bias.sizes()[0], "Bias size must equal dim 2 in " + TORCH_CHECK(weight_size[2] == bias.sizes()[0], "Bias size must equal dim 2 in " "the weight tensor (output channels)."); // input * weights + bias -> output_features diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index d55b554dd69e..59ec3a3c95ff 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -1,121 +1,142 @@ #include #include -#include #include -#include #include -#include +#include +#include namespace { -bool copy_transpose_valid(const at::Tensor& self, const at::Tensor& src) { +using namespace at; + +bool copy_transpose_valid(const Tensor& self, const Tensor& src) { const int MIN_SZ = 60 * 60; return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 && src.stride(0) == 1 && src.stride(1) == src.size(0) && + self.scalar_type() == src.scalar_type() && self.numel() >= MIN_SZ; } +// special case copy where tensor is contiguous and src is a transposed matrix +// This can be generalized to most copies, but it's trickier +void copy_same_type_transpose_(Tensor& self, const Tensor& src) { + int64_t BLOCK_SZ; + if (self.scalar_type() == kByte) { + BLOCK_SZ = 120; + } else { + BLOCK_SZ = 60; + } + Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); + + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, self.scalar_type(), "copy_", [&] { + scalar_t* sp = src.data(); + scalar_t* rp = self.data(); + scalar_t* bp = buf.data(); + + int64_t NR = src.size(0); + int64_t NC = src.size(1); + for (int64_t R = 0; R < NR; R += BLOCK_SZ) { + for (int64_t C = 0; C < NC; C += BLOCK_SZ) { + scalar_t* spo = sp + R + C * NR; + scalar_t* rpo = rp + C + R * NC; + + int nr = std::min(NR - R, BLOCK_SZ); + int nc = std::min(NC - C, BLOCK_SZ); + + // 1. copy columns from src to buf + for (int c = 0; c < nc; c++) { + memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t)); + } + + // 2. transpose buf in place + int rc_max = std::max(nr, nc); + int rc_min = std::min(nr, nc); + for (int r = 0; r < rc_max; r++) { + int end = std::min(r, rc_min); + for (int c = 0; c < end; c++) { + scalar_t tmp = bp[r + BLOCK_SZ * c]; + bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; + bp[r * BLOCK_SZ + c] = tmp; + } + } + + // 3. 
copy rows from buf to dst + for (int r = 0; r < nr; r++) { + memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t)); + } + } + } + }); +} + +// Devices directly supported by this copy implementation. Other device types +// (e.g. XLA) may be supported by overriding copy_ and _copy_from. +bool is_supported_device(Device device) { + DeviceType device_type = device.type(); + return device_type == kCPU || device_type == kCUDA || device_type == kHIP; +} + } // namespace namespace at { namespace native { Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { - Tensor b_src; + // TODO: this should be handled during dispatch, but that's missing... + TORCH_CHECK(self.defined(), "self is undefined"); + TORCH_CHECK(src.defined(), "src is undefined"); + if (self.is_sparse() && src.is_sparse()) { return at::copy_sparse_to_sparse_(self, src, non_blocking); + } else if (self.is_sparse() || src.is_sparse()) { + AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", + self.type(), " and src type = ", src.type()); } - if (!self.is_sparse() && !src.is_sparse()) { - std::tie(b_src) = expand_inplace(self, src, "copy"); - return s_copy_(self, b_src, non_blocking); - } - AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", - self.type(), " and src type = ", src.type()); -} -Tensor& _s_copy__cpu(Tensor& self, const Tensor& src, bool non_blocking) { - if (src.type_id() != CPUTensorId()) { - _s_copy_from(src, self, non_blocking); + if (self.is_same(src)) { return self; } - if (self.scalar_type() == src.scalar_type()) { - copy_kernel_same_type(kCPU, self, src); - } else { - AT_CHECK(self.numel() == src.numel(), "sizes do not match"); - copy_kernel_cast(kCPU, self, src); + // Re-dispatch copies when src device not implemented here (e.g. XLA). + // This includes: cpu_tensor.copy_(xla_tensor) which + // calls xla_tensor._copy_from(cpu_tensor) + if (!is_supported_device(src.device())) { + TORCH_INTERNAL_ASSERT(is_supported_device(self.device())); + at::_copy_from(src, self, non_blocking); + return self; } - return self; -} -// special case copy where tensor is contiguous and src is a transposed matrix -// This can be generalized to most copies, but it's tricker -void _copy_same_type_transpose_(Tensor& self, const Tensor& src) { - int64_t BLOCK_SZ; - if (self.scalar_type() == kByte) { - BLOCK_SZ = 120; - } else { - BLOCK_SZ = 60; + if (self.scalar_type() == kQUInt8) { + return quantized_copy_(self, src); } - Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, self.scalar_type(), "_copy_same_type_transpose_", [&]() { - scalar_t* sp = src.data(); - scalar_t* rp = self.data(); - scalar_t* bp = buf.data(); - - int64_t NR = src.size(0); - int64_t NC = src.size(1); - for (int64_t R = 0; R < NR; R += BLOCK_SZ) { - for (int64_t C = 0; C < NC; C += BLOCK_SZ) { - scalar_t* spo = sp + R + C * NR; - scalar_t* rpo = rp + C + R * NC; - - int nr = std::min(NR - R, BLOCK_SZ); - int nc = std::min(NC - C, BLOCK_SZ); - - // 1. copy columns from src to buf - for (int c = 0; c < nc; c++) { - memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t)); - } - - // 2.
transpose buf in place - int rc_max = std::max(nr, nc); - int rc_min = std::min(nr, nc); - for (int r = 0; r < rc_max; r++) { - int end = std::min(r, rc_min); - for (int c = 0; c < end; c++) { - scalar_t tmp = bp[r + BLOCK_SZ * c]; - bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; - bp[r * BLOCK_SZ + c] = tmp; - } - } - - // 3. copy rows from buf to dst - for (int r = 0; r < nr; r++) { - memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t)); - } - } - } - }); -} + auto builder = TensorIterator::Builder(); + builder.add_output(self); + builder.add_input(src); + builder.dont_resize_outputs(); + builder.dont_compute_common_dtype(); + auto iter = builder.build(); -void _copy_same_type__cpu(Tensor& self, const Tensor& src) { - if (self.is_same(src)) { - return; + if (iter->numel() == 0) { + return self; } - if (self.numel() == src.numel() && copy_transpose_valid(self, src)) { - return _copy_same_type_transpose_(self, src); + DeviceType device_type = iter->device_type(0); + if (iter->device_type(1) == kCUDA) { + device_type = kCUDA; } - copy_kernel_same_type(kCPU, self, src); + if (device_type == kCPU && copy_transpose_valid(self, src)) { + copy_same_type_transpose_(self, src); + return self; + } + + copy_stub(device_type, *iter, non_blocking); + return self; } -DEFINE_DISPATCH(copy_kernel_cast); -DEFINE_DISPATCH(copy_kernel_same_type); +DEFINE_DISPATCH(copy_stub); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 08989740a17b..a8d16f6f7b87 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -1,8 +1,12 @@ #pragma once #include +#include namespace at { + +struct TensorIterator; + namespace native { // Note [Implicit conversion between signed and unsigned] @@ -43,5 +47,9 @@ struct inter_copy_type { template using inter_copy_type_t = typename inter_copy_type::type; +using copy_fn = void (*)(TensorIterator&, bool non_blocking); + +DECLARE_DISPATCH(copy_fn, copy_stub); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 8788969797a5..4e1e230cc37d 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -16,17 +16,17 @@ Tensor cross(const Tensor & input, const Tensor & other, const c10::optional dimension) { auto device_res = input.type().device_type(); - AT_CHECK(device_res == kCPU || device_res == kCUDA, "cross only supports CPU and CUDA devices, out got: ", device_res); + TORCH_CHECK(device_res == kCPU || device_res == kCUDA, "cross only supports CPU and CUDA devices, out got: ", device_res); auto device1 = input.type().device_type(); - AT_CHECK(device1 == kCPU || device1 == kCUDA, "cross only supports CPU and CUDA devices, input got: ", device1); + TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "cross only supports CPU and CUDA devices, input got: ", device1); auto device2 = other.type().device_type(); - AT_CHECK(device2 == kCPU || device2 == kCUDA, "cross only supports CPU and CUDA devices, other got: ", device2); - AT_CHECK(device_res == device1, "out and input must have the same device type. out: ", device_res, " input: ", device1); - AT_CHECK(device1 == device2, "input and other must have the same device type. 
input: ", device1, " other: ", device2); - AT_CHECK(!out.is_cuda() || out.get_device() == input.get_device(), "device of out (", input.get_device(), ") must match device of input (", other.get_device(), ")"); - AT_CHECK(!input.is_cuda() || input.get_device() == other.get_device(), "device of input (", input.get_device(), ") must match device of other (", other.get_device(), ")"); - AT_CHECK(input.dim() == other.dim(), "inconsistent tensors dimensions input: ", input.dim(), " other: ", other.dim()); - AT_CHECK(input.sizes() == other.sizes(), "inconsistent tensors sizes input: ", input.sizes(), " other: ", other.sizes()); + TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "cross only supports CPU and CUDA devices, other got: ", device2); + TORCH_CHECK(device_res == device1, "out and input must have the same device type. out: ", device_res, " input: ", device1); + TORCH_CHECK(device1 == device2, "input and other must have the same device type. input: ", device1, " other: ", device2); + TORCH_CHECK(!out.is_cuda() || out.get_device() == input.get_device(), "device of out (", input.get_device(), ") must match device of input (", other.get_device(), ")"); + TORCH_CHECK(!input.is_cuda() || input.get_device() == other.get_device(), "device of input (", input.get_device(), ") must match device of other (", other.get_device(), ")"); + TORCH_CHECK(input.dim() == other.dim(), "inconsistent tensors dimensions input: ", input.dim(), " other: ", other.dim()); + TORCH_CHECK(input.sizes() == other.sizes(), "inconsistent tensors sizes input: ", input.sizes(), " other: ", other.sizes()); int64_t dim = -1; if(!dimension.has_value()) { @@ -36,10 +36,10 @@ Tensor & cross_out(Tensor & out, const Tensor & input, const Tensor & other, con break; } } - AT_CHECK(dim >= 0, "no dimension of size 3 in input"); + TORCH_CHECK(dim >= 0, "no dimension of size 3 in input"); } else { dim = maybe_wrap_dim(dimension.value(), input.dim()); - AT_CHECK(input.size(dim) == 3, "dimension ", dimension.value(), " does not have size 3"); + TORCH_CHECK(input.size(dim) == 3, "dimension ", dimension.value(), " does not have size 3"); } if (out.sizes() != input.sizes()) { diff --git a/aten/src/ATen/native/DilatedMaxPool.h b/aten/src/ATen/native/DilatedMaxPool.h new file mode 100644 index 000000000000..71f5bb5fe813 --- /dev/null +++ b/aten/src/ATen/native/DilatedMaxPool.h @@ -0,0 +1,112 @@ +#include +#include +#include +#include + +#pragma once + +namespace at { +namespace native { + +namespace { + +template +static inline dest_t +safe_downcast(src_t v) +{ + TORCH_CHECK(std::numeric_limits::min() <= v && v <= std::numeric_limits::max(), + "integer out of range"); + + return static_cast(v); +} + +template +static inline T pooling_output_shape( + T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) { + T outputSize = ((inputSize + 2 * pad - dilation * (kernelSize - 1) - 1 + (ceil_mode ? 
stride - 1 : 0)) / stride + 1); + if (pad) { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputSize - 1) * stride >= inputSize + pad) + --outputSize; + } + return outputSize; +} + +static inline void +max_pool2d_with_indices_shape_check( + const Tensor& input, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth) +{ + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got ", + "kH: ", kH, " kW: ", kW); + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got " + "dH: ", dH, " dW: ", dW); + TORCH_CHECK(dilationH > 0 && dilationW > 0, + "dilation should be greater than zero, but got ", + "dilationH: ", dilationH, " dilationW: ", dilationW); + + TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4), + "non-empty 3D or 4D input tensor expected but got ndim: ", ndim); + TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, + "pad should be smaller than half of kernel size, but got ", + "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); + + if (outputWidth < 1 || outputHeight < 1) { + AT_ERROR("Given input size: (", + nInputPlane, "x", inputHeight, "x", inputWidth, "). ", + "Calculated output size: (", + nOutputPlane, "x", outputHeight, "x", outputWidth, "). ", + "Output size is too small"); + } +} + +static inline void +max_pool2d_with_indices_shape_check( + const Tensor& input, + const Tensor& gradOutput, + const Tensor& indices, + int64_t nbatch, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, + bool cuda=false) +{ + max_pool2d_with_indices_shape_check( + input, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane); + check_dim_size(gradOutput, ndim, ndim-2, outputHeight); + check_dim_size(gradOutput, ndim, ndim-1, outputWidth); + + if (cuda) { + check_dim_size(indices, 4, 0, nbatch); + check_dim_size(indices, 4, 1, nOutputPlane); + check_dim_size(indices, 4, 2, outputHeight); + check_dim_size(indices, 4, 3, outputWidth); + } + else { + check_dim_size(indices, ndim, ndim-3, nOutputPlane); + check_dim_size(indices, ndim, ndim-2, outputHeight); + check_dim_size(indices, ndim, ndim-1, outputWidth); + } +} + +} // namespace + +} // at::native +} // at diff --git a/aten/src/ATen/native/DilatedMaxPool2d.cpp b/aten/src/ATen/native/DilatedMaxPool2d.cpp new file mode 100644 index 000000000000..acc7eebaf88b --- /dev/null +++ b/aten/src/ATen/native/DilatedMaxPool2d.cpp @@ -0,0 +1,499 @@ +#include +#include +#include +#include +#include + + +namespace at { +namespace native { + +namespace { + +template +static void max_pool2d_with_indices_single_out_frame( + scalar_t *input_p, + scalar_t *output_p, + int64_t *ind_p, + int64_t nslices, + int64_t iwidth, + int64_t iheight, + int64_t owidth, + int64_t oheight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH + ) +{ + at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { + for (auto k = start; k < 
end; k++) + { + /* loop over output */ + int64_t i, j; + scalar_t *ip = input_p + k*iwidth*iheight; + for(i = 0; i < oheight; i++) + { + for(j = 0; j < owidth; j++) + { + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight); + int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth); + while(hstart < 0) + hstart += dilationH; + while(wstart < 0) + wstart += dilationW; + + /* local pointers */ + scalar_t *op = output_p + k*owidth*oheight + i*owidth + j; + int64_t *indp = ind_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + int64_t maxindex = -1; + scalar_t maxval = -std::numeric_limits::max(); + int64_t tcntr = 0; + int64_t x,y; + for(y = hstart; y < hend; y += dilationH) + { + for(x = wstart; x < wend; x += dilationW) + { + tcntr = y*iwidth + x; + scalar_t val = *(ip + tcntr); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = tcntr; + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex; + } + } + } + }); +} + +template +static void max_pool2d_with_indices_out_frame( + scalar_t *input_data, + scalar_t *output_data, + int64_t *indices_data, + int64_t nbatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH) +{ + at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { + for (auto p = start; p < end; p++) { + max_pool2d_with_indices_single_out_frame( + input_data+p*nInputPlane*inputWidth*inputHeight, + output_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH); + } + }); +} + +void max_pool2d_with_indices_out_cpu_template( + Tensor& output, + Tensor& indices, + const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + /* sizes */ + const int64_t nbatch = input_.ndimension() == 4 ? 
input_.size(-4) : 1; + const int64_t nInputPlane = input_.size(-3); + const int64_t inputHeight = input_.size(-2); + const int64_t inputWidth = input_.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + max_pool2d_with_indices_shape_check( + input_, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth); + + /* get contiguous input */ + Tensor input = input_.contiguous(); + + /* resize output */ + if (input.ndimension() == 3) + { + output.resize_({nInputPlane, outputHeight, outputWidth}); + /* indices will contain the locations for each output point */ + indices.resize_({nInputPlane, outputHeight, outputWidth}); + + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_cpu", + [&] { + /* get raw pointers */ + scalar_t *input_data = input.data(); + scalar_t *output_data = output.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_single_out_frame( + input_data, output_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH); + } + ); + } + else + { + output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + /* indices will contain the locations for each output point */ + indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_cpu", + [&] { + scalar_t *input_data = input.data(); + scalar_t *output_data = output.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_out_frame( + input_data, + output_data, + indices_data, + nbatch, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH); } + ); + } +} + +template +static void max_pool2d_with_indices_backward_single_out_frame( + scalar_t *gradInput_p, + scalar_t *gradOutput_p, + int64_t *ind_p, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int dW, + int dH) +{ + at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) { + for (auto k = start; k < end; k++) + { + scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; + scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; + int64_t *ind_p_k = ind_p + k*outputWidth*outputHeight; + + /* calculate max points */ + int64_t i, j; + for(i = 0; i < outputHeight; i++) + { + for(j = 0; j < outputWidth; j++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_k[i*outputWidth + j]; + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; + } + } + } + } + }); +} + +template +static void max_pool2d_with_indices_backward_out_frame( + scalar_t *gradInput_data, + scalar_t *gradOutput_data, + int64_t *indices_data, + int64_t nbatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int dW, + int dH) +{ + at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { + for (auto p = start; p < end; p++) { + max_pool2d_with_indices_backward_single_out_frame( + gradInput_data+p*nInputPlane*inputWidth*inputHeight, + gradOutput_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + 
nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + }); +} + +Tensor& max_pool2d_with_indices_backward_out_cpu_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input, + const Tensor& indices, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + /* get contiguous gradOutput */ + const Tensor gradOutput = gradOutput_.contiguous(); + + /* resize */ + gradInput.resize_as_(input); + gradInput.zero_(); + + /* sizes */ + const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + const int64_t outputHeight = gradOutput.size(-2); + const int64_t outputWidth = gradOutput.size(-1); + + /* XXX preserve the existing shape check behavior */ + const int64_t outputHeight_for_shape_check = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth_for_shape_check = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + max_pool2d_with_indices_shape_check( + input, + gradOutput_, + indices, + nbatch, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight_for_shape_check, outputWidth_for_shape_check); + + /* backprop */ + if (input.ndimension() == 3) + { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_backward", + [&] { + /* get raw pointers */ + scalar_t *gradInput_data = gradInput.data(); + scalar_t *gradOutput_data = gradOutput.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_backward_single_out_frame( + gradInput_data, gradOutput_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + ); + } + else + { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_backward", + [&] { + /* get raw pointers */ + scalar_t *gradInput_data = gradInput.data(); + scalar_t *gradOutput_data = gradOutput.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_backward_out_frame( + gradInput_data, gradOutput_data, + indices_data, + nbatch, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + ); + } + + return gradInput; +} + +} // namespace + +std::tuple max_pool2d_with_indices_out_cpu( + Tensor& output, + Tensor& indices, + const Tensor& input, + 
IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + max_pool2d_with_indices_out_cpu_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +std::tuple max_pool2d_with_indices_cpu( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + Tensor output = at::empty({0}, input.options()); + Tensor indices = at::empty({0}, input.options().dtype(kLong)); + max_pool2d_with_indices_out_cpu_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +Tensor& max_pool2d_with_indices_backward_out_cpu( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + max_pool2d_with_indices_backward_out_cpu_template( + gradInput, + gradOutput_, + input, + indices, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +Tensor max_pool2d_with_indices_backward_cpu( + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + auto gradInput = at::zeros_like(input); + max_pool2d_with_indices_backward_out_cpu_template( + gradInput, + gradOutput_, + input, + indices, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +} // at::native +} // at diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index d8420fc3cd3a..76dad527721e 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -17,28 +17,28 @@ Tensor pairwise_distance(const Tensor& x1, const Tensor& x2, double p, double ep // This is to guarantee that the contiguous memory is passed to the backward pass Tensor pdist(const Tensor& self, const double p) { - AT_CHECK(self.dim() == 2, + TORCH_CHECK(self.dim() == 2, "pdist only supports 2D tensors, got: ", self.dim(), "D"); - AT_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); - AT_CHECK(p >= 0, "pdist only supports non-negative p values"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); + TORCH_CHECK(p >= 0, "pdist only supports non-negative p values"); return at::_pdist_forward(self.contiguous(), p); } Tensor cdist(const Tensor& x1, const Tensor& x2, const double p) { - AT_CHECK(x1.dim() == 2, "cdist only supports 2D tensors, X1 got: ", x1.dim(), "D"); - AT_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type()); + TORCH_CHECK(x1.dim() == 2, "cdist only supports 2D tensors, X1 got: ", x1.dim(), "D"); + TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type()); auto device1 = x1.type().device_type(); - AT_CHECK(device1 == kCPU || device1 == kCUDA, "cdist only supports CPU and CUDA devices, X1 got: ", device1); - AT_CHECK(x2.dim() == 2, "cdist only supports 2D tensors, X2 got: ", x2.dim(), "D"); - AT_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type()); + TORCH_CHECK(device1 == kCPU || device1 == 
kCUDA, "cdist only supports CPU and CUDA devices, X1 got: ", device1); + TORCH_CHECK(x2.dim() == 2, "cdist only supports 2D tensors, X2 got: ", x2.dim(), "D"); + TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type()); auto device2 = x2.type().device_type(); - AT_CHECK(device2 == kCPU || device2 == kCUDA, "cdist only supports CPU and CUDA devices, X2 got: ", device2); - AT_CHECK(p >= 0, "cdist only supports non-negative p values"); - AT_CHECK(device1 == device2, "X1 and X2 must have the same device type. X1: ", device1, " X2: ", device2); - AT_CHECK(!x1.is_cuda() || x1.get_device() == x2.get_device(), "device of X1 (", x1.get_device(), ") must match device of X2 (", x2.get_device(), ")"); + TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "cdist only supports CPU and CUDA devices, X2 got: ", device2); + TORCH_CHECK(p >= 0, "cdist only supports non-negative p values"); + TORCH_CHECK(device1 == device2, "X1 and X2 must have the same device type. X1: ", device1, " X2: ", device2); + TORCH_CHECK(!x1.is_cuda() || x1.get_device() == x2.get_device(), "device of X1 (", x1.get_device(), ") must match device of X2 (", x2.get_device(), ")"); int64_t c1 = x1.size(-1); int64_t c2 = x2.size(-1); - AT_CHECK(c1 == c2, "X1 and X2 must have the same number of columns. X1: ", c1, " X2: ", c2); + TORCH_CHECK(c1 == c2, "X1 and X2 must have the same number of columns. X1: ", c1, " X2: ", c2); int64_t r1 = x1.size(-2); int64_t r2 = x2.size(-2); @@ -54,24 +54,24 @@ Tensor cdist(const Tensor& x1, const Tensor& x2, const double p) { } Tensor _cdist_backward(const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& cdist) { - AT_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous"); - AT_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous"); - AT_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous"); + TORCH_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous"); + TORCH_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous"); + TORCH_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous"); int64_t n = x1.size(-2); int64_t m = x1.size(-1); auto device1 = x1.type().device_type(); - AT_CHECK(device1 == kCPU || device1 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X1 got: ", device1); + TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X1 got: ", device1); auto device2 = x2.type().device_type(); - AT_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2); + TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2); Tensor grad_x1 = at::empty({n, m}, x1.options()); cdist_backward_stub(device1, grad_x1, grad, x1, x2, p, cdist); return grad_x1; } Tensor _pdist_forward(const Tensor& self, const double p) { - AT_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); + TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); auto device = self.type().device_type(); - AT_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); + TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); Tensor result = at::empty({0}, self.options()); if (self.size(0) <= 1) { 
result.resize_({0}); @@ -89,10 +89,10 @@ Tensor _pdist_forward(const Tensor& self, const double p) { } Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, const Tensor& pdist) { - AT_CHECK(self.is_contiguous(), "_pdist_backward requires self to be contiguous"); - AT_CHECK(pdist.is_contiguous(), "_pdist_backward requires pdist to be contiguous"); + TORCH_CHECK(self.is_contiguous(), "_pdist_backward requires self to be contiguous"); + TORCH_CHECK(pdist.is_contiguous(), "_pdist_backward requires pdist to be contiguous"); auto device = self.type().device_type(); - AT_CHECK(device == kCPU || device == kCUDA, "_pdist_backward only supports CPU and CUDA devices, got: ", device); + TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_backward only supports CPU and CUDA devices, got: ", device); Tensor result = at::empty_like(self); pdist_backward_stub(device, result, grad, self, p, pdist); return result; diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 9d7c3aa2f3c8..e99f6be6c994 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -153,7 +153,7 @@ Tensor& bernoulli_tensor_cpu_(Tensor& self, const Tensor& p_, Generator* gen) { DEFINE_DISPATCH(bernoulli_mkl_stub); Tensor& bernoulli_scalar_cpu_(Tensor& self, double p, Generator* gen) { - AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); #if AT_MKL_ENABLED() if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { bernoulli_mkl_stub(kCPU, self, p, gen); diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 52181f1f1eac..32b27d438134 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -10,7 +10,7 @@ using Ctype = typename std::conditional::type; Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sizes(); - AT_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); + TORCH_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); std::vector sizes; sizes.reserve(input.dim()); sizes.push_back(input_sizes[0]); @@ -21,7 +21,7 @@ Tensor make_feature_noise(const Tensor& input) { } bool is_fused_kernel_acceptable(const Tensor& input, double p) { - return input.is_cuda() && p > 0 && p < 1; + return input.is_cuda() && p > 0 && p < 1 && input.numel() > 0; } // NB: sure, we could have used different overloads here, but I would feel insecure @@ -40,8 +40,8 @@ Tensor multiply(const Tensor& input, const Tensor& noise) { template Ctype _dropout_impl(T& input, double p, bool train) { - AT_CHECK(p >= 0 && p <= 1, "dropout probability has to be between 0 and 1, but got ", p); - if (p == 0 || !train) { + TORCH_CHECK(p >= 0 && p <= 1, "dropout probability has to be between 0 and 1, but got ", p); + if (p == 0 || !train || input.numel() == 0) { return input; } diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 485a2b0d39b5..66d4fb1c48b3 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -339,7 +339,7 @@ _embedding_bag_cpu(const Tensor &weight, const Tensor &indices, checkScalarTypes("embedding_bag", weight_arg, {kFloat, kDouble}); if (per_sample_weights.defined()) { - AT_CHECK(mode == MODE_SUM, + TORCH_CHECK(mode == MODE_SUM, "embedding_bag: 
per_sample_weights only supported with mode='sum'"); auto per_input_weights_arg = TensorArg( per_sample_weights,"per_sample_weights", 1); @@ -624,7 +624,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( const Tensor& offsets, const Tensor& offset2bag, int64_t mode) { - AT_CHECK( + TORCH_CHECK( mode == MODE_SUM, "embedding_bag_backward: per_sample_weights only supported for mode='sum'"); diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index 879a798a40b0..8e7dc4e6343a 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -144,7 +144,7 @@ void fractional_max_pool2d_out_cpu_template( auto input = input_.contiguous(); int ndims = input.ndimension(); - AT_CHECK(input.numel() > 0 && (ndims == 3 || ndims == 4), + TORCH_CHECK(input.numel() > 0 && (ndims == 3 || ndims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input, but got: ", ndims); @@ -160,10 +160,10 @@ void fractional_max_pool2d_out_cpu_template( int inputH = input.size(heightDim); int inputW = input.size(widthDim); - AT_CHECK(outputH + poolSizeH - 1 <= inputH, + TORCH_CHECK(outputH + poolSizeH - 1 <= inputH, "fractional_max_pool2d(): pool height ", poolSizeH, " too large relative to input height ", inputH); - AT_CHECK(outputW + poolSizeW - 1 <= inputW, + TORCH_CHECK(outputW + poolSizeW - 1 <= inputW, "fractional_max_pool2d(): pool width ", poolSizeW, " too large relative to input width ", inputW); @@ -284,9 +284,9 @@ Tensor& fractional_max_pool2d_backward_out_cpu_template( /* get contiguous gradOutput */ auto gradOutput = gradOutput_.contiguous(); - AT_CHECK(outputW == gradOutput.size(widthDim), + TORCH_CHECK(outputW == gradOutput.size(widthDim), "fractional_max_pool2d_backward(): gradOutput width unexpected"); - AT_CHECK(outputH == gradOutput.size(heightDim), + TORCH_CHECK(outputH == gradOutput.size(heightDim), "fractional_max_pool2d_backward(): gradOutput height unexpected"); /* resize */ diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index f807cc610d02..9ed35b4add68 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -161,7 +161,7 @@ void fractional_max_pool3d_out_cpu_template( int64_t widthDim = 3; int64_t ndims = input_.ndimension(); - AT_CHECK(input_.numel() != 0 && (ndims == 4 || ndims == 5), + TORCH_CHECK(input_.numel() != 0 && (ndims == 4 || ndims == 5), "fractional_max_pool3d_out(): non-empty 4D or 5D (batch mode) tensor ", " expected for input, but got: ", ndims); @@ -179,13 +179,13 @@ void fractional_max_pool3d_out_cpu_template( int64_t inputH = input_.size(heightDim); int64_t inputW = input_.size(widthDim); - AT_CHECK(outputT + poolSizeT - 1 < inputT, + TORCH_CHECK(outputT + poolSizeT - 1 < inputT, "fractional_max_pool3d_out(): pool time ", poolSizeT, " too large relative to input time ", inputT); - AT_CHECK(outputW + poolSizeW - 1 < inputW, + TORCH_CHECK(outputW + poolSizeW - 1 < inputW, "fractional_max_pool3d_out(): pool width ", poolSizeW, " too large relative to input width ", inputW); - AT_CHECK(outputH + poolSizeH - 1 < inputH, + TORCH_CHECK(outputH + poolSizeH - 1 < inputH, "fractional_max_pool3d_out(): pool height ", poolSizeH, " too large relative to input height ", inputH); @@ -317,12 +317,12 @@ void fractional_max_pool3d_backward_out_cpu_template( int64_t inputH = input.size(heightDim); int64_t inputW = input.size(widthDim); - AT_CHECK(outputT == 
gradOutput_.size(timeDim), + TORCH_CHECK(outputT == gradOutput_.size(timeDim), "fractional_max_pool3d_backward_out(): gradOutput time unexpected"); - AT_CHECK(outputH == gradOutput_.size(heightDim), + TORCH_CHECK(outputH == gradOutput_.size(heightDim), "fractional_max_pool3d_backward_out(): ", "gradOutput height unexpected"); - AT_CHECK(outputW == gradOutput_.size(widthDim), + TORCH_CHECK(outputW == gradOutput_.size(widthDim), "fractional_max_pool3d_backward_out(): gradOutput width unexpected"); /* get contiguous gradOutput */ diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 6b54a5f779f0..71f17d99b269 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -559,39 +559,39 @@ grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, con Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - AT_CHECK( + TORCH_CHECK( input.defined() && grid.defined(), "grid_sampler(): expected input and grid to not be undefined, but input " "is ", input, " and grid is ", grid); auto input_opt = input.options(); auto grid_opt = grid.options(); - AT_CHECK( + TORCH_CHECK( input_opt.device() == grid_opt.device(), "grid_sampler(): expected input and grid to be on same device, but input " "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); - AT_CHECK( + TORCH_CHECK( input_opt.dtype() == grid_opt.dtype(), "grid_sampler(): expected input and grid to have same dtype, but input " "has ", input_opt.dtype(), " and grid has ", grid_opt.dtype()); - AT_CHECK( + TORCH_CHECK( input_opt.layout() == kStrided && grid_opt.layout() == kStrided, "grid_sampler(): expected input and grid to have torch.strided layout, but " "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); - AT_CHECK( + TORCH_CHECK( (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), "grid_sampler(): expected 4D or 5D input and grid with same number of " "dimensions, but got input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - AT_CHECK( + TORCH_CHECK( input.size(0) == grid.size(0), "grid_sampler(): expected grid and input to have same batch size, but got " "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - AT_CHECK( + TORCH_CHECK( grid.size(-1) == input.dim() - 2, "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " "dimension, but got grid with sizes ", grid.sizes()); for (int64_t i = 2; i < input.dim(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "grid_sampler(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index 839d69bdd73a..cb379df7a31a 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -32,7 +32,7 @@ namespace native{ Tensor cartesian_prod(TensorList tensors) { for(const Tensor &t : tensors) { - AT_CHECK(t.dim() == 1, "Expect a 1D vector, but got shape ", t.sizes()); + TORCH_CHECK(t.dim() == 1, "Expect a 1D vector, but got shape ", t.sizes()); } if (tensors.size() == 1) { return tensors[0]; @@ -45,8 +45,8 @@ Tensor cartesian_prod(TensorList tensors) { } Tensor combinations(const Tensor& self, int64_t r, bool with_replacement) { - AT_CHECK(self.dim() == 1, "Expect a 1D vector, but got shape ", self.sizes()); - AT_CHECK(r > 0, "Expect 
a positive number, but got ", r); + TORCH_CHECK(self.dim() == 1, "Expect a 1D vector, but got shape ", self.sizes()); + TORCH_CHECK(r > 0, "Expect a positive number, but got ", r); int64_t num_elements = self.numel(); std::vector grids = at::meshgrid(std::vector(r, self)); Tensor mask = _triu_mask(num_elements, r, with_replacement, self.options()); diff --git a/aten/src/ATen/native/LegacyDefinitions.cpp b/aten/src/ATen/native/LegacyDefinitions.cpp index c62de35f2937..73e64d72dec1 100644 --- a/aten/src/ATen/native/LegacyDefinitions.cpp +++ b/aten/src/ATen/native/LegacyDefinitions.cpp @@ -249,7 +249,7 @@ Tensor & random_(Tensor& self, Generator * generator) { return at::legacy::th::_th_random_(self, generator); } -Tensor & uniform_(Tensor& self, double from, double to, Generator * generator) { +Tensor & uniform_cpu_(Tensor& self, double from, double to, Generator * generator) { return at::legacy::th::_th_uniform_(self, from, to, generator); } @@ -405,10 +405,10 @@ Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & Tensor masked_select(const Tensor & self, const Tensor & mask) { if (mask.dtype() == at::ScalarType::Byte) { - return at::legacy::th::_th_masked_select(self, mask); -} else { - return at::legacy::th::_th_masked_select_bool(self, mask); -} + return at::legacy::th::_th_masked_select(self, mask); + } else { + return at::legacy::th::_th_masked_select_bool(self, mask); + } } Tensor & nonzero_out(Tensor & result, const Tensor & self) { diff --git a/aten/src/ATen/native/LegacyNNDefinitions.cpp b/aten/src/ATen/native/LegacyNNDefinitions.cpp index dcf819e51a39..ecde3af9dcab 100644 --- a/aten/src/ATen/native/LegacyNNDefinitions.cpp +++ b/aten/src/ATen/native/LegacyNNDefinitions.cpp @@ -332,22 +332,6 @@ Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scal return at::legacy::th::_thnn_softshrink_backward(grad_output, self, lambd); } -Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntArrayRef output_size) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_forward_out(output, self, output_size); -} - -Tensor adaptive_avg_pool3d(const Tensor & self, IntArrayRef output_size) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_forward(self, output_size); -} - -Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_backward_out(grad_input, grad_output, self); -} - -Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_backward(grad_output, self); -} - Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad) { return at::legacy::th::_thnn_avg_pool2d_forward_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); } @@ -380,22 +364,6 @@ Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntA return at::legacy::th::_thnn_avg_pool3d_backward(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); } -std::tuple max_pool2d_with_indices_out(Tensor & output, Tensor & indices, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { - return at::legacy::th::_thnn_max_pool2d_with_indices_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); -} - -std::tuple 
max_pool2d_with_indices(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { - return at::legacy::th::_thnn_max_pool2d_with_indices_forward(self, kernel_size, stride, padding, dilation, ceil_mode); -} - -Tensor & max_pool2d_with_indices_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices) { - return at::legacy::th::_thnn_max_pool2d_with_indices_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); -} - -Tensor max_pool2d_with_indices_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices) { - return at::legacy::th::_thnn_max_pool2d_with_indices_backward(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); -} - std::tuple max_pool3d_with_indices_out(Tensor & output, Tensor & indices, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { return at::legacy::th::_thnn_max_pool3d_with_indices_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); } diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index 37dfa0522d8f..5cd4c67376ee 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -38,7 +38,7 @@ namespace native { Tensor& lerp_cpu_tensor_out(Tensor& result, const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out_cpu"); result.resize_as_(b_self); @@ -62,10 +62,10 @@ Tensor& lerp_cpu_scalar_out(Tensor& result, const Tensor& self, Tensor& lerp_cpu_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp__cpu"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cpu", [&]{ lerp_cpu(self, b_self, b_end, b_weight); @@ -76,7 +76,7 @@ Tensor& lerp_cpu_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) Tensor& lerp_cpu_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor b_self, b_end; std::tie(b_self, b_end) = expand_outplace(self, end, "lerp__cpu"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cpu", [&]{ @@ -87,7 +87,7 @@ Tensor& lerp_cpu_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor lerp_cpu_tensor(const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, 
b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_cpu"); Tensor result = at::empty_like(b_self); diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 0970a9217eea..b03e1032d945 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -33,7 +33,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias) { static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArrayRef sum_dims_, bool keepdim) { // assumes that tensors have been pre-unsqueezed (so that all dimensions match - after broadcasting) // but makes no other assumptions on the order of dimensions - AT_CHECK(left_.dim()==right_.dim(), "number of dimensions must match"); + TORCH_CHECK(left_.dim()==right_.dim(), "number of dimensions must match"); if (sum_dims_.size() == 0) return at::mul(left_, right_); int64_t dim = left_.dim(); @@ -50,7 +50,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra auto sr = right.size(i)>1; if (sum_dims[i]) { // first dimensions that will be summed over after multiplication if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size - AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + TORCH_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); sum_size *= left.size(i); } else if (sl) { // if it is only in one of left and right, we can sum right away left = left.sum(i, true); @@ -59,7 +59,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra } } else if (sl && sr) { // now deal with dimensions dimensions that will be in the output // dimensions nontrivially in both left and right must be of the same size - AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + TORCH_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); lro.push_back(i); lro_size *= left.size(i); } else if (sl) { // keep track of dimensions appearing only once @@ -168,7 +168,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { int64_t num_total_idxes = 0; while (! eqn_stream.eof()) { std::getline(eqn_stream, term, ','); // term = string with indices of current term - AT_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension + TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions @@ -178,7 +178,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { for (auto &c : term) { // c = character with a single letter or '.' if (c == '.') { ell_char_count++; - AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' 
in term ", operand, " of the equation"); if (ell_char_count == 3) { // this completes the ellipsis if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size first_ell_idx = num_total_idxes; @@ -186,7 +186,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_total_idxes += num_ell_idxes; } else { // we have seen an ellipsis before, so we check compatibility - AT_CHECK(candidate_num_ell_idxes == num_ell_idxes, + TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); } for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices @@ -196,8 +196,8 @@ Tensor einsum(std::string eqn, TensorList tensors) { dims_in_term += num_ell_idxes; // keep track of dimensions } } else { // a letter (hopefully) - AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); - AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; // letter_num = position in letter_mapping if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping letter_mapping[letter_num] = num_total_idxes; @@ -211,12 +211,12 @@ Tensor einsum(std::string eqn, TensorList tensors) { dims_in_term++; } } - AT_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); + TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); input_op_idxes.push_back(std::move(current_op_idxes)); operand++; } // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. - AT_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); + TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); // the following parses or infers output (right hand side) // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) @@ -228,19 +228,19 @@ Tensor einsum(std::string eqn, TensorList tensors) { for (auto &c : eqn.substr(pos+2)) { if (c == '.') { // '.' as part of ellipsis ell_char_count++; - AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); if (ell_char_count == 3) { // ellipsis complete - AT_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); + TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); for (int64_t i = 0; i < num_ell_idxes; ++i) { idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; num_output_dims++; } } } else if (! isspace(c)) { // letter (hopefully) - AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); - AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis in the right hand side"); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; - AT_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, "occurs twice in output"); + TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, "occurs twice in output"); idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; num_output_dims++; } @@ -293,11 +293,11 @@ Tensor einsum(std::string eqn, TensorList tensors) { size_of_dims[idx] = preprocessed_op.size(dim); } else { - AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); } dim++; } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] - AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); // diagonal moves the diagonal dimension to the back // now we permute the last dim back to idx_to_dim[dim_out] @@ -367,7 +367,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, IntArrayRef expand1_, IntArrayRef expand2_, IntArrayRef expand3_, IntArrayRef sumdim_, int64_t unroll_dim) { int64_t total_dim = i1_.dim()+expand1_.size(); - AT_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]"); + TORCH_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]"); auto expand1 = at::dim_list_to_bitset(expand1_, total_dim); auto expand2 = at::dim_list_to_bitset(expand2_, total_dim); auto expand3 = at::dim_list_to_bitset(expand3_, total_dim); @@ -433,18 +433,18 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, } Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const Tensor& bias) { - AT_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); + TORCH_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); for (int64_t i = 0; i < input1.dim() - 1; i++) { - AT_CHECK(input1.size(i) == input2.size(i), + TORCH_CHECK(input1.size(i) == input2.size(i), "bilinear(): input batch dimensions do not match at dim ", i, ": got ", input1.size(i), " and ", input2.size(i)); } - AT_CHECK(input1.size(input1.dim() - 1) == weight.size(1), + TORCH_CHECK(input1.size(input1.dim() - 1) == weight.size(1), "bilinear(): input1 size does not match weight size: got ", input1.size(input1.dim() - 1), " but expected ", weight.size(1)); - AT_CHECK(input2.size(input2.dim() - 1) == weight.size(2), + TORCH_CHECK(input2.size(input2.dim() - 1) == weight.size(2), "bilinear(): input2 size does not match weight size: got ", input2.size(input2.dim() - 1), " but expected ", weight.size(2)); - AT_CHECK(!bias.defined() || bias.size(0) == weight.size(0), + TORCH_CHECK(!bias.defined() || bias.size(0) == weight.size(0), "bilinear(): bias size does not match 
weight size: got ", bias.size(0), " but expected ", weight.size(0)); @@ -464,7 +464,7 @@ Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight // implements tensordot, a matrix-multiplication-like contraction, but the dimensions given // in the two dimension lists Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, IntArrayRef dims2) { - AT_CHECK(dims1.size() == dims2.size(), "both dimension lists should have same length"); + TORCH_CHECK(dims1.size() == dims2.size(), "both dimension lists should have same length"); int64_t csize = 1; // total size of the contracted dimensions Tensor t1 = input1; Tensor t2 = input2; @@ -476,7 +476,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, } else if (s1 == 1) { t2 = t2.sum(dims2[i], true); } else { - AT_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i], + TORCH_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i], " and second has size ", s2, " in dim ", dims2[i]); csize *= s1; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index b40120c3a5e2..c01f37beb480 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -23,7 +23,7 @@ static inline std::tuple _lu_det_P_diag_U_info(const Tensor Tensor p, lu, info; std::tie(lu, p, info) = at::_lu_with_info(self, /*pivot=*/true, /*check_errors=*/false); int int_info = info.item(); - AT_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); + TORCH_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); auto n = self.size(0); auto num_exchanges = (at::arange(1, n + 1, p.options()) != p).nonzero().size(0); if (num_exchanges % 2 == 1) { @@ -34,7 +34,7 @@ static inline std::tuple _lu_det_P_diag_U_info(const Tensor } Tensor det(const Tensor& self) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2 && self.size(0) == self.size(1), "det(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " "of floating types"); @@ -50,7 +50,7 @@ Tensor det(const Tensor& self) { } Tensor logdet(const Tensor& self) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2 && self.size(0) == self.size(1), "logdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " "of floating types"); @@ -73,7 +73,7 @@ Tensor logdet(const Tensor& self) { } std::tuple slogdet(const Tensor& self) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2 && self.size(0) == self.size(1), "slogdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " "of floating types"); @@ -93,7 +93,7 @@ std::tuple slogdet(const Tensor& self) { } Tensor pinverse(const Tensor& self, double rcond) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, "pinverse(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); if (self.numel() == 0) { @@ -121,7 +121,7 @@ static inline Tensor _matrix_rank_helper(const Tensor& self, bool symmetric) { } Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, 
+ TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, "matrix_rank(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); @@ -130,7 +130,7 @@ Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { } Tensor matrix_rank(const Tensor& self, bool symmetric) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, "matrix_rank(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); @@ -140,7 +140,7 @@ Tensor matrix_rank(const Tensor& self, bool symmetric) { } static void check_1d(const Tensor& t, const char* arg, const char* fn) { - AT_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D"); + TORCH_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D"); } Tensor ger(const Tensor& self, const Tensor& vec2) { @@ -368,7 +368,7 @@ Tensor dot(const Tensor& self, const Tensor& tensor) { Tensor& dot_out(Tensor& result, const Tensor& self, const Tensor& tensor) { result.resize_({}); - AT_CHECK(result.scalar_type() == self.scalar_type(), + TORCH_CHECK(result.scalar_type() == self.scalar_type(), "result dtype ", result.scalar_type(), " does not match self dtype ", self.scalar_type()); return result.fill_(self.dot(tensor)); } @@ -428,6 +428,29 @@ Tensor matmul( Tensor output = has_out ? at::_unsafe_view(at::mm_out(out, t1, t2), output_size) : at::_unsafe_view(t1.mm(t2), output_size); return has_out ? out.set_(output) : output; + } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { + // optimization: transpose the inner dimensions of the arguments, call + // matmul on the swapped arguments, then transpose the inner dimensions + // of the result. + const int64_t n = dim_tensor1 == 2 ? tensor1.size(-2) : 1; + const int64_t m = tensor1.size(-1); + const int64_t p = tensor2.size(-1); + + const Tensor t2_T = tensor2.transpose(-1, -2); + const Tensor t1_T = dim_tensor1 == 2 ? tensor1.t() : tensor1.reshape({n, m}).t(); + const Tensor res_T = matmul(out_opt, t2_T, t1_T); + + if (dim_tensor1 == 2) { + Tensor res = res_T.transpose(-1, -2).contiguous(); + return has_out ? out.set_(res) : res; + } + else { + std::vector shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec(); + shape.push_back(p); + + Tensor res = res_T.reshape(shape).contiguous(); + return has_out ? 
out.set_(res) : res; + } } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { // We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); // we track m1 vs m2 separately even though they must match for nicer error messages @@ -489,7 +512,7 @@ Tensor& matmul_out(Tensor &result, const Tensor & tensor1, const Tensor & tensor } Tensor matrix_power(const Tensor& a, int64_t n) { - AT_CHECK(a.dim() >= 2 && at::isFloatingType(a.scalar_type()), + TORCH_CHECK(a.dim() >= 2 && at::isFloatingType(a.scalar_type()), "matrix_power(", a.type(), "{", a.sizes(), "}): expected a tensor " "of floating types with dim at least 2"); if (n == 0) { @@ -531,7 +554,7 @@ Tensor frobenius_norm(const Tensor& self) { } Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { - AT_CHECK( + TORCH_CHECK( dim.size() <= 2, "Expected at most 2 dimensions, but got ", dim.size(), @@ -547,7 +570,7 @@ Tensor &frobenius_norm_out( const Tensor& self, IntArrayRef dim, bool keepdim) { - AT_CHECK( + TORCH_CHECK( dim.size() <= 2, "Expected at most 2 dimensions, but got ", dim.size(), @@ -559,7 +582,7 @@ Tensor &frobenius_norm_out( } Tensor nuclear_norm(const Tensor& self, bool keepdim) { - AT_CHECK( + TORCH_CHECK( self.dim() == 2, "Expected a tensor with 2 dimensions, but got a ", self.dim(), @@ -568,7 +591,7 @@ Tensor nuclear_norm(const Tensor& self, bool keepdim) { } Tensor &nuclear_norm_out(Tensor& result, const Tensor& self, bool keepdim) { - AT_CHECK( + TORCH_CHECK( self.dim() == 2, "Expected a tensor with 2 dimensions, but got a ", self.dim(), diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 3e470f51da06..9fee50876462 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -99,11 +99,11 @@ static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A) { AT_ERROR(ss.str()); } - AT_CHECK(A.size(-1) == A.size(-2), + TORCH_CHECK(A.size(-1) == A.size(-2), "A must be batches of square matrices, " "but they are ", A.size(-1), " by ", A.size(-2), " matrices"); - AT_CHECK(A.size(-1) == self.size(-2), + TORCH_CHECK(A.size(-1) == self.size(-2), "Incompatible matrix sizes for matmul: each A " "matrix is ", A.size(-1), " by ", A.size(-1), " but each b matrix is ", self.size(-2), " by ", self.size(-1)); @@ -111,7 +111,7 @@ static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A) { // Validates input shapes for operations on batches of square matrices (inverse, cholesky, lu) static inline void squareCheckInputs(const Tensor& self) { - AT_CHECK(self.size(-1) == self.size(-2), + TORCH_CHECK(self.size(-1) == self.size(-2), "A must be batches of square matrices, " "but they are ", self.size(-1), " by ", self.size(-2), " matrices"); } @@ -164,7 +164,7 @@ static inline void singleCheckErrors(int64_t info, const char* name) { // Checks if all the Tensors in a TensorList are of the same dimensions static inline void checkAllSameDim(TensorList tensors, int64_t dim) { for (auto &t : tensors) { - AT_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); + TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); } } diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 7def0da9ae0f..ad18ee8578e4 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -1,9 +1,15 @@ +// define constants like M_PI and C keywords for MSVC 
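The new matmul branch above, which handles a 1-D or 2-D first operand against a batched (3-D or higher) second operand, relies on the identity (A*B)^T = B^T * A^T: it multiplies the transposed operands in swapped order and transposes the result back, so the batched tensor can stay as the leading argument. A minimal standalone check of that identity on plain 2x2 arrays (ordinary C++, no ATen) looks like this; the small integer entries keep the double comparison exact:

// transpose_identity.cpp: verifies A*B == transpose(transpose(B)*transpose(A)).
#include <array>
#include <cassert>

using Mat2 = std::array<std::array<double, 2>, 2>;

// Naive 2x2 matrix product.
Mat2 mul(const Mat2& a, const Mat2& b) {
  Mat2 c{};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k)
        c[i][j] += a[i][k] * b[k][j];
  return c;
}

Mat2 transpose(const Mat2& a) {
  return {{{a[0][0], a[1][0]}, {a[0][1], a[1][1]}}};
}

int main() {
  Mat2 a = {{{1, 2}, {3, 4}}};
  Mat2 b = {{{5, 6}, {7, 8}}};
  Mat2 lhs = mul(a, b);                                   // A*B
  Mat2 rhs = transpose(mul(transpose(b), transpose(a)));  // (B^T * A^T)^T
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      assert(lhs[i][j] == rhs[i][j]);
  return 0;
}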
+#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif #include #include #include #include #define EPSILON 1e-12 +#define _USE_MATH_DEFINES namespace { static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { @@ -125,4 +131,21 @@ Tensor binary_cross_entropy_with_logits_backward(const Tensor& grad, const Tenso return grad_input; } + +Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction) +{ + Tensor loss; + if (log_input) { + loss = at::exp(input) - target * input; + } else { + loss = input - target * at::log(input + eps); + } + + if (full) { + auto mask1 = (target > 1); + loss.masked_select(mask1) += (target * at::log(target) - target + 0.5 * at::log(2 * M_PI * target)).masked_select(mask1); + } + + return apply_loss_reduction(loss, reduction); +} }} // namespace at::native diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index b17f9c9f7d8d..756f36f4abc4 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -49,9 +49,9 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); - AT_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); - AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + TORCH_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); + TORCH_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + TORCH_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); size_t tg_target_stride; int64_t max_target_length = 0; @@ -77,13 +77,13 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const } tg_target_stride = targets.stride(1); checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, + TORCH_CHECK(targets.size(1) >= max_target_length, "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, " (while checking arguments for ", c, ")"); } int64_t max_input_length = log_probs.size(0); for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, + TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", input_lengths[b], " for ", log_probs_arg, " (while checking arguments for ", c, ")"); } @@ -377,8 +377,8 @@ Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntArrayRef inpu // Convenience function accepting Tensors Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, const Tensor& input_lengths, const Tensor& target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) { - AT_CHECK(isIntegralType(input_lengths.scalar_type()), "input_lenghts must be integral"); - AT_CHECK(isIntegralType(target_lengths.scalar_type()), "target_lenghts must be integral"); + TORCH_CHECK(isIntegralType(input_lengths.scalar_type()), "input_lenghts must be integral"); + TORCH_CHECK(isIntegralType(target_lengths.scalar_type()), "target_lenghts must be integral"); Tensor ilc = input_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); Tensor tlc = 
target_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ca0215cc4a8b..0f257e6eb247 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -15,7 +15,7 @@ namespace at { namespace native { namespace { void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ - AT_CHECK(actual == expected, + TORCH_CHECK(actual == expected, arg_name, " should contain ", expected, " elements not ", actual); } @@ -434,7 +434,7 @@ Tensor instance_norm( const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { - AT_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), + TORCH_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), "Expected running_mean and running_var to be defined when use_input_stats is false"); std::vector shape = input.sizes().vec(); int64_t b = input.size(0); @@ -462,64 +462,6 @@ Tensor instance_norm( return out.view(input.sizes()); } -Tensor layer_norm(const Tensor& input, IntArrayRef normalized_shape, - const Tensor& weight /* optional */, const Tensor& bias /* optional */, - double eps, bool cudnn_enabled) { - - int64_t normalized_ndim = normalized_shape.size(); - - AT_CHECK(normalized_ndim >= 1, - "Expected normalized_shape to be at least 1-dimensional, i.e., ", - "containing at least one element, but got normalized_shape=", - normalized_shape); - - AT_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape), - "Expected weight to be of same shape as normalized_shape, but got ", - "weight of shape ", weight.sizes(), " and normalized_shape=", - normalized_shape); - AT_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape), - "Expected bias to be of same shape as normalized_shape, but got ", - "bias of shape ", bias.sizes(), " and normalized_shape=", - normalized_shape); - - auto input_shape = input.sizes(); - auto input_ndim = input.dim(); - - if (input_ndim < normalized_ndim || - !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) { - std::stringstream ss; - ss << "Given normalized_shape=" << normalized_shape - << ", expected input with shape [*"; - for (auto size : normalized_shape) { - ss << ", " << size; - } - ss << "], but got input of size" << input_shape; - AT_ERROR(ss.str()); - } - - int64_t n = 1; - for (int64_t i = 0; i < input_ndim - normalized_ndim; i++) { - n *= input_shape[i]; - } - - // Apply layer norm - auto input_reshaped = input.contiguous().view({1, n, -1}); - - auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, - cudnn_enabled); - out = out.view(input_shape); - - if (weight.defined() && bias.defined()) { - return bias.addcmul(out, weight, 1); - } else if (weight.defined()) { - return out.mul(weight); - } else if (bias.defined()) { - return out.add(bias); - } else { - return out; - } -} - Tensor group_norm(const Tensor& input, int64_t num_groups, const Tensor& weight /* optional */, const Tensor& bias /* optional */, double eps, bool cudnn_enabled) { @@ -528,16 +470,16 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, int64_t b = input.size(0); int64_t c = input.size(1); - AT_CHECK(c % num_groups == 0, + TORCH_CHECK(c % num_groups == 0, "Expected number of channels in input 
to be divisible by ", "num_groups, but got input of shape ", input.sizes(), " and " "num_groups=", num_groups); - AT_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), + TORCH_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), "Expected weight to be a vector of size equal to the number of ", "channels in input, but got weight of shape ", weight.sizes(), " and input of shape ", input.sizes()); - AT_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), + TORCH_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), "Expected bias to be a vector of size equal to the number of ", "channels in input, but got bias of shape ", weight.sizes(), " and input of shape ", input.sizes()); diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index d02f97ee3e59..2962e04676e9 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -3,7 +3,7 @@ namespace at { namespace native { Tensor one_hot(const Tensor &self, int64_t num_classes) { - AT_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); + TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); auto shape = self.sizes().vec(); // empty tensor could be converted to one hot representation, @@ -18,11 +18,11 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } // non-empty tensor - AT_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); + TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } else { - AT_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); + TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); } shape.push_back(num_classes); diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index bd9a70c781ff..39dd0b518863 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -4,7 +4,7 @@ namespace at { namespace native { void checkLongTensor(const Tensor& tensor) { - AT_CHECK(tensor.dim() == 1 && tensor.type().device_type() == at::kCPU && tensor.scalar_type() == at::kLong, + TORCH_CHECK(tensor.dim() == 1 && tensor.type().device_type() == at::kCPU && tensor.scalar_type() == at::kLong, "'lengths' argument should be a 1D CPU int64 tensor"); } @@ -20,10 +20,10 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten int64_t batch_size = input.size(1); int64_t * lengths = lengths_t.data(); - AT_CHECK(lengths_t.size(0) == batch_size, + TORCH_CHECK(lengths_t.size(0) == batch_size, "Expected `len(lengths)` to be equal to batch_size, but got ", lengths_t.size(0), " (batch_size=", batch_size, ")"); - AT_CHECK(lengths[batch_size - 1] > 0, + TORCH_CHECK(lengths[batch_size - 1] > 0, "Length of all samples has to be greater than 0, but found an element " "in 'lengths' that is <= 0"); for(auto i = 0; i < batch_size - 1; i++) { @@ -83,7 +83,7 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten } prev_l = l; } - AT_CHECK(l >= prev_l); + TORCH_CHECK(l >= prev_l); } return std::make_tuple(at::cat(steps), batch_sizes_t); @@ -95,7 +95,7 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_size, const Tensor& _batch_sizes, bool batch_first) { std::vector input_size_after_t 
= input_size.vec(); if (batch_first) { - AT_CHECK(input_size.size() >= 2); + TORCH_CHECK(input_size.size() >= 2); std::swap(input_size_after_t[0], input_size_after_t[1]); } auto grad_input = at::zeros(input_size_after_t, grad.options()); @@ -126,7 +126,7 @@ std::tuple _pad_packed_sequence(const Tensor& data, const Tensor int64_t max_real_seq_length = batch_sizes_t.size(0); int64_t max_seq_length = max_real_seq_length; if (total_length > 0) { - AT_CHECK(total_length >= max_seq_length, + TORCH_CHECK(total_length >= max_seq_length, "Expected total_length to be at least the length of the longest " "sequence in input, but got total_length=", total_length, " and " "max sequence length being ", max_seq_length); diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 9e267ffb76ed..a04c2878845e 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -12,7 +12,7 @@ static void check1d( const char* function_name, const char* argument_name, IntArrayRef x) { - AT_CHECK( + TORCH_CHECK( x.size() == 1, function_name, "() argument '", argument_name, "' should contain one int (got ", x.size(), ")"); diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 3e5b7369fc05..449a99530b1c 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -49,13 +49,13 @@ signature. and may be omitted by passing an undefined tensor. When a function takes multiple `Tensor` arguments, these tensors are assumed to be the same type (e.g., if one argument is a `FloatTensor`, all other arguments are checked - to be `FloatTensor`s.) + to be `FloatTensor`s). `Tensor` or `Tensor?` must sometimes be annotated to indicate aliasing and mutability. - In general annotations can be defined via the following four situtations - `Tensor(a)` - `a` is a set of Tensors that may alias to the same data. - `Tensor(a!)` - `a` members of a may be written to thus mutating the underlying data. - `Tensor!` - shorthand for Tensor(fresh\_identifier!) - `Tensor(a! -> a|b)` - Tensor is in set `a`, written to, and after the write is in set `a` AND `b`. + In general annotations can be defined via the following four situations: + - `Tensor(a)` - `a` is a set of Tensors that may alias to the same data. + - `Tensor(a!)` - `a` members of a may be written to thus mutating the underlying data. + - `Tensor!` - shorthand for Tensor(fresh\_identifier!) + - `Tensor(a! -> a|b)` - Tensor is in set `a`, written to, and after the write is in set `a` AND `b`. For more details on when and why this needs to happen, please see the section on annotations. - `Tensor[]`. A `Tensor[]` argument translates into a C++ argument of type `ArrayRef` (a.k.a. `TensorList`) @@ -80,18 +80,18 @@ signature. - `*` is a special sentinel argument, which doesn't translate into an actual argument, but indicates that in the Python bindings, any subsequent arguments must be specified as keyword arguments (and cannot be provided positionally). -- `?` is trailing question mark that annotate an argument to be an optional type, grep for +- `?` is trailing question mark that annotates an argument to be an optional type. Grep for `optional` to find some example usages. In general, most functions will not need to use this, but there are some cases that we want to use optional for the different types: - - You want to pass in a `None` to a ATen function/method from Python, and handles the - None type in the C++ side. For example, `clamp(Tensor self, Scalar? min=None, Scalar? 
max=None)` - can take `None` for its `min` and `max` parameter, and do dispatch to different - backend if one of the parameters is `None`. Optional type can accept a `None` type + - You want to pass a `None` to an ATen function/method from Python and handle the + None type on the C++ side. For example, `clamp(Tensor self, Scalar? min=None, Scalar? max=None)` + can take `None` for its `min` and `max` parameter, but does not dispatch to different + backends if one of the parameters is `None`. Optional type can accept a `None` type (`nullopt` in C++) from Python and use the [C++ Optional class](https://en.cppreference.com/w/cpp/utility/optional) to interact with the parameters. - - You want a default value which is fine in Python but would cause ambiguity in C++. + - You want a default value, which is fine in Python, but would cause ambiguity in C++. For example, `norm(Tensor self, Scalar p=2, int dim, bool keepdim=False)` would - cause ambiguity in C++ since it default args must be adjacent and `p` could not - have a default value when `dim` does not. Therefore, we need to make `p` as a + cause ambiguity in C++ since its default args must be adjacent (`p` could not + have a default value when `dim` does not). Therefore, we need to make `p` as a optional Scalar, and make `p=2` when `p` is not passed in (nullopt). - You want a value to default to the same value as another argument (this cannot be expressed in C++ default arguments). @@ -123,7 +123,7 @@ Here are the supported default values: * Numbers (e.g., `0` or `5.0` for `int`, `float` and `int[]` with an explicit length (e.g., `int[2]`)--in the case of `int[]` a number is replicated to fill the length (e.g., `int[2] x=2` - is equivalent to `int[2] x=[2,2]`. + is equivalent to `int[2] x=[2,2]`). * Lists of numbers (e.g., `[0, 0]`) for `IntList`. * Booleans (e.g., `True`) for `bool`. * Empty initializer lists (e.g., `[]`) for `Tensor` (this implicitly changes @@ -191,19 +191,19 @@ more complicated neural network layers (e.g., `conv2d`) and internal functions designed specifically for binding (e.g., `cudnn_convolution`). As we progress along our schema unification of the `func` schema with the JIT -signatue schema, we must introduce features that allow us to increase compliance. +signature schema, we must introduce features that allow us to increase compliance. One of these features are Tensor annotations. As of now we use naming conventions to indicate whether an argument of a function is going to be mutated and returned. ### `annotations` There are two typical situations in which we mutate the memory of an argument in the Python -frontend: -a) For an inplace operations such as `self.abs_()` +frontend: +a) For an inplace operations such as `self.abs_()` b) for a function with an output keyword argument such as `torch.abs(input, out=None)`. In order to provide implementations for these Python functions the legacy schema -requires C++ implementations for three situations `abs(Tensor self) -> Tensor`, +requires C++ implementations for three situations `abs(Tensor self) -> Tensor`, `abs_(Tensor self) -> Tensor` and `abs_out(Tensor out, Tensor self) -> Tensor`. Now, as we move towards the unification, we start to use a different syntax to represent @@ -220,14 +220,14 @@ Let's revisit the previous native function declarations and see the conventions `self` may be written to and returned. Further, the annotation indicates that the return value may alias the input. This indicates an inplace function and by convention ends in a single '\_'. 
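The annotation conventions described above, together with the out= variant covered in the next bullet, can be mimicked in a standalone toy. The sketch below uses a plain std::vector as a stand-in for Tensor (none of it is ATen code, and the helper names are made up) to contrast the functional abs, the in-place abs_ that mutates and returns its argument, and the abs_out variant that writes into a caller-provided output:

// annotation_demo.cpp: toy illustration of the functional, in-place ("_") and
// out= calling conventions that the schema annotations describe. "Vec" is a
// stand-in for Tensor; the three functions are hypothetical, not ATen.
#include <cassert>
#include <cmath>
#include <vector>

using Vec = std::vector<double>;

// abs(Tensor self) -> Tensor: no annotation, returns fresh memory.
Vec abs_functional(const Vec& self) {
  Vec r;
  r.reserve(self.size());
  for (double v : self) r.push_back(std::fabs(v));
  return r;
}

// abs_(Tensor(a!) self) -> Tensor(a!): mutates its input and returns it.
Vec& abs_inplace(Vec& self) {
  for (double& v : self) v = std::fabs(v);
  return self;
}

// abs_out(Tensor(a!) out, Tensor self) -> Tensor(a!): writes into `out`.
Vec& abs_out(Vec& out, const Vec& self) {
  out.clear();
  for (double v : self) out.push_back(std::fabs(v));
  return out;
}

int main() {
  Vec x = {-1.0, 2.0, -3.0};

  Vec y = abs_functional(x);  // x untouched, y is a fresh result
  assert(x[0] == -1.0 && y[0] == 1.0);

  Vec out;
  abs_out(out, x);            // result lands in `out`, x still untouched
  assert(out[2] == 3.0);

  abs_inplace(x);             // x itself is rewritten and returned
  assert(x[0] == 1.0);
  return 0;
}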
- `abs(Tensor self, *, Tensor(a!) out) -> Tensor(a!)` - In the Python frontend `out` can be passed as a keyword argument and may be written to. + In the Python frontend `out` can be passed as a keyword argument and may be written to. In this case it indicates the schema for a function that must accept `out` as this does not provide a default argument. The idea behind representing this as a optional argument is to document the intended usage. This maps to the legacy `abs_out(Tensor out, Tensor self) -> Tensor`. As with the legacy `_out` function you must call the argument `Tensor out` or `Tensor out0`, `Tensor out1` in the context of multiple arguments. -There is also another situtation in which we use annotations, namely views. +There is also another situation in which we use annotations, namely views. - `transpose(Tensor(a) self, int dim0, int dim1) -> Tensor(a)` An alias to the memory represented by `self` may be also returned, however it is not mutated. @@ -298,9 +298,8 @@ implementation (no header necessary) with a matching signature to the generated header from the ATen metadata. There are many simple native functions; take a look at some of them to see what to do. -Although, for the most part, writing an ATen function is mostly writing -the algorithm you want to implement, there are some less obvious details -you should also consider. +Although writing an ATen function is mostly writing the algorithm you want +to implement, there are some less obvious details you should also consider. ### Will your function be automatically differentiable? diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 420e0ea4df04..9b5071941e89 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -114,10 +114,10 @@ struct QuantizedCellParams { const Scalar zero_point_hh; Tensor matmul_ih(Tensor input) const { - AT_CHECK(false, "matmul is not supported with quantized cell params"); + TORCH_CHECK(false, "matmul is not supported with quantized cell params"); } Tensor matmul_hh(Tensor h) const { - AT_CHECK(false, "matmul is not supported with quantized cell params"); + TORCH_CHECK(false, "matmul is not supported with quantized cell params"); } Tensor linear_ih(Tensor input) const { return at::fbgemm_linear_int8_weight( @@ -132,7 +132,7 @@ struct QuantizedCellParams { // Gathers every two elements of a vector in a vector of pairs template static std::vector> pair_vec(const std::vector& vals) { - AT_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); + TORCH_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); std::vector> result; result.reserve(vals.size() / 2); for (int64_t i = 0; i < vals.size(); i += 2) { @@ -158,12 +158,12 @@ static std::vector gather_params(TensorList params, bool has_biases) static at::Tensor undefined; std::vector result; if (has_biases) { - AT_CHECK(params.size() % 4 == 0, "got an incorrect number of RNN parameters"); + TORCH_CHECK(params.size() % 4 == 0, "got an incorrect number of RNN parameters"); for (size_t i = 0; i < params.size(); i += 4) { result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3]); } } else { - AT_CHECK(params.size() % 2 == 0, "got an incorrect number of RNN parameters"); + TORCH_CHECK(params.size() % 2 == 0, "got an incorrect number of RNN parameters"); for (size_t i = 0; i < params.size(); i += 2) { result.emplace_back(params[i], params[i + 1], undefined, undefined); } @@ -174,7 +174,7 @@ static std::vector 
gather_params(TensorList params, bool has_biases) static std::vector gather_quantized_params(TensorList params) { static at::Tensor undefined; std::vector result; - AT_CHECK(params.size() % 12 == 0, "got an incorrect number of quantized RNN parameters"); + TORCH_CHECK(params.size() % 12 == 0, "got an incorrect number of quantized RNN parameters"); for (size_t i = 0; i < params.size(); i += 12) { result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3], params[i + 4], params[i + 5], params[i + 6], params[i + 7], @@ -512,8 +512,8 @@ LayerOutput> apply_layer_stack(const Layer& layer, const io_type& input, const std::vector& hiddens, const std::vector& weights, int64_t num_layers, double dropout_p, bool train) { - AT_CHECK(num_layers == hiddens.size(), "Expected more hidden states in stacked_rnn"); - AT_CHECK(num_layers == weights.size(), "Expected more weights in stacked_rnn"); + TORCH_CHECK(num_layers == hiddens.size(), "Expected more hidden states in stacked_rnn"); + TORCH_CHECK(num_layers == weights.size(), "Expected more weights in stacked_rnn"); auto layer_input = input; auto hidden_it = hiddens.begin(); @@ -658,7 +658,7 @@ std::tuple lstm( const Tensor& _input, TensorList hx, TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(_input)) { Tensor output, hy, cy; lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases, @@ -680,7 +680,7 @@ std::tuple lstm( const Tensor& data, const Tensor& batch_sizes, TensorList hx, TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(data)) { Tensor output, hy, cy; lstm_packed_cudnn_stub(data.type().device_type(), output, hy, cy, data, batch_sizes, hx, @@ -698,7 +698,7 @@ std::tuple lstm( std::tuple lstm_cell( const Tensor& input, TensorList hx, const Tensor& w_ih, const Tensor& w_hh, const Tensor& b_ih, const Tensor& b_hh) { - AT_CHECK(hx.size() == 2, "lstm_cell expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm_cell expects two hidden states"); return LSTMCell{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh}); } @@ -730,7 +730,7 @@ std::tuple quantized_lstm( const Tensor& _input, TensorList hx, TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(_input)) { Tensor output, hy, cy; lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases, @@ -739,7 +739,7 @@ std::tuple quantized_lstm( } check_device(_input, _params, hx); auto input = batch_first ? 
_input.transpose(0, 1) : _input; - AT_CHECK(has_biases, "quantized LSTM requires biases"); + TORCH_CHECK(has_biases, "quantized LSTM requires biases"); auto params = gather_quantized_params(_params); auto results = _lstm_impl( input, params, hx[0], hx[1], num_layers, dropout_p, train, bidirectional); diff --git a/aten/src/ATen/native/RNN.h b/aten/src/ATen/native/RNN.h index a4a359a07380..d9bdd90e4860 100644 --- a/aten/src/ATen/native/RNN.h +++ b/aten/src/ATen/native/RNN.h @@ -25,7 +25,7 @@ inline void check_device(const Tensor& input, const TensorList& params, const Te auto check_tensors = [&](const std::string& name, const Tensor& t) { if (!t.defined()) return; auto t_device = t.device(); - AT_CHECK(input_device == t_device, + TORCH_CHECK(input_device == t_device, "Input and ", name, " tensors are not at the same device, found input tensor at ", input_device, " and ", name, " tensor at ", t_device); }; diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index 2a7fe9b4f1f8..63a4d7b71cc0 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -9,7 +9,7 @@ namespace at { namespace native { Tensor& linspace_cpu_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -42,7 +42,7 @@ Tensor& linspace_cpu_out(Tensor& result, Scalar start, Scalar end, int64_t steps } Tensor& logspace_cpu_out(Tensor& result, Scalar start, Scalar end, int64_t steps, double base) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -82,11 +82,11 @@ Tensor& range_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) { auto xend = end.to(); auto xstep = step.to(); - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); int64_t size = static_cast(((xend - xstart) / xstep) + 1); if (result.numel() != size) { @@ -132,14 +132,14 @@ Tensor& arange_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) { / step.to()); } - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); - AT_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), "invalid size, possible overflow?"); int64_t size = static_cast(size_d); diff --git a/aten/src/ATen/native/ReduceOps.cpp 
b/aten/src/ATen/native/ReduceOps.cpp index 9cb247defb4c..330cad03a2af 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -91,7 +91,7 @@ static std::unique_ptr make_reduction( bool keepdim, ScalarType dtype) { // check that result type and dtype match if provided - AT_CHECK( + TORCH_CHECK( !result.defined() || result.scalar_type() == dtype, name, ": provided dtype must match dtype of result. Got ", toString(result.scalar_type()), @@ -114,6 +114,41 @@ static std::unique_ptr make_reduction( return TensorIterator::reduce_op(viewed_result, self.to(dtype)); } +static std::unique_ptr make_reduction( + const char* name, Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim, + bool keepdim, ScalarType dtype) +{ + // check that result type and dtype match if provided + for (const Tensor *t: {&result1, &result2}) { + const Tensor& result = *t; + TORCH_CHECK( + !result.defined() || result.type().scalarType() == dtype, + name, ": provided dtype must match dtype of result. Got ", + toString(result.type().scalarType()), + " and ", + toString(dtype), + "."); + } + + int64_t ndim = self.dim(); + DimMask mask = make_dim_mask(dim, ndim); + allocate_reduction_result(result1, self, mask, keepdim, dtype); + auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim); + + allocate_reduction_result(result2, self, mask, keepdim, dtype); + auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim); + + // special case for type promotion in mixed precision, improves computational + // efficiency. + // We don't generalize this to common mismatched input/output types to avoid cross + // product of templated kernel launches. + if (self.type().scalarType() == dtype || + (self.is_cuda() && self.type().scalarType() == kHalf && dtype == kFloat)) { + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self); + } + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype)); +} + static inline int64_t n_dim_size(const Tensor& self, IntArrayRef dim) { int64_t numel = 1; for (auto d : dim) { @@ -136,7 +171,7 @@ Tensor cumsum(const Tensor& self, int64_t dim) { static inline Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { // result type is favored over dtype; check that they match if provided (NumPy doesn't check) - AT_CHECK( + TORCH_CHECK( !dtype.has_value() || (result.scalar_type() == dtype.value()), "provided dtype must match dtype of result in cumsum. Got ", toString(result.scalar_type()), @@ -168,7 +203,7 @@ Tensor cumprod(const Tensor& self, int64_t dim) { static inline Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { // result type is favored over dtype; check that they match if provided (NumPy doesn't check) - AT_CHECK( + TORCH_CHECK( !dtype.has_value() || (result.scalar_type() == dtype.value()), "provided dtype must match dtype of result in cumprod. Got ", toString(result.scalar_type()), @@ -257,7 +292,7 @@ Tensor prod(const Tensor &self) { static inline Tensor &mean_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool keepdim, optional opt_dtype) { ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); - AT_CHECK( + TORCH_CHECK( at::isFloatingType(scalarType), "Can only calculate the mean of floating types. 
Got ", toString(scalarType), @@ -419,11 +454,11 @@ Tensor logsumexp(const Tensor &self, IntArrayRef dims, bool keepdim) { static Tensor& norm_out(Tensor &result, const Tensor &self, optional opt_p, IntArrayRef dim, bool keepdim, optional opt_dtype) { auto p = opt_p.value_or(2.0); - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); - AT_CHECK( + TORCH_CHECK( at::isFloatingType(scalarType), "Can only calculate the mean of floating types. Got ", toString(scalarType), @@ -443,9 +478,9 @@ static inline Tensor _norm(const Tensor &self, Scalar p) { if (self.is_sparse()) { return at::native_norm(self, p); } else { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "norm only supports floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "norm only supports floating-point dtypes"); Tensor result; return at::native::norm_out(result, self, p, {}, false, c10::nullopt); @@ -494,10 +529,10 @@ inline Tensor & _all(Tensor & result, std::unique_ptr & iter) { } Tensor all(const Tensor& self) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "all only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "all only supports torch.uint8 dtype"); Tensor result = at::empty({0}, self.options()); @@ -512,10 +547,10 @@ Tensor all(const Tensor& self, int64_t dim, bool keepdim) { } Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "all only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "all only supports torch.uint8 dtype"); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { @@ -538,10 +573,10 @@ inline Tensor & _any(Tensor & result, std::unique_ptr & iter) { } Tensor any(const Tensor& self) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "any only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "any only supports torch.uint8 dtype"); Tensor result = at::empty({0}, self.options()); @@ -556,10 +591,10 @@ Tensor any(const Tensor& self, int64_t dim, bool keepdim) { } Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "any 
only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "any only supports torch.uint8 dtype"); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { @@ -578,7 +613,7 @@ Tensor min_values(const Tensor& self, IntArrayRef dims, bool keepdim) { Tensor result = at::empty({0}, self.options()); ScalarType dtype = get_dtype(result, self, {}, true); auto iter = make_reduction("min_values", result, self, dims, keepdim, dtype); - AT_CHECK(iter->numel() > 0, "min_values on a tensor with no elements is not defined."); + TORCH_CHECK(iter->numel() > 0, "min_values on a tensor with no elements is not defined."); min_values_stub(iter->device_type(), *iter); return result; } @@ -591,16 +626,16 @@ Tensor max_values(const Tensor& self, IntArrayRef dims, bool keepdim) { Tensor result = at::empty({0}, self.options()); ScalarType dtype = get_dtype(result, self, {}, true); auto iter = make_reduction("max_values", result, self, dims, keepdim, dtype); - AT_CHECK(iter->numel() > 0, "max_values on a tensor with no elements is not defined."); + TORCH_CHECK(iter->numel() > 0, "max_values on a tensor with no elements is not defined."); max_values_stub(iter->device_type(), *iter); return result; } } static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "std and var only support CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "std and var only support floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "std and var only support floating-point dtypes"); ScalarType dtype = get_dtype(result, self, {}, true); auto iter = make_reduction("std or var", result, self, dim, keepdim, dtype); if (iter->numel() == 0) { @@ -611,10 +646,72 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, return result; } +static std::tuple std_var_mean_out(const char* fname, Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) { + AT_ASSERT(result1.defined() && result2.defined()); + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + fname, " only support CPU and CUDA backend, got: ", toString(self.type().backend())); + TORCH_CHECK(at::isFloatingType(self.type().scalarType()), fname, " only support floating-point dtypes"); + TORCH_CHECK(result1.type().scalarType() == result2.type().scalarType(), + "provided by result1 dtype must match dtype of result2. 
Got ", + toString(result1.type().scalarType()), + " and ", + toString(result2.type().scalarType()), + "."); + ScalarType dtype = get_dtype(result1, self, {}, true); + auto iter = make_reduction(fname, result1, result2, self, dim, keepdim, dtype); + if (iter->numel() == 0) { + result1.fill_(NAN); + result2.fill_(NAN); + } else { + std_var_stub(iter->device_type(), *iter, unbiased, take_sqrt); + } + return std::tuple(result1, result2); +} + +std::tuple var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) { + return std_var_mean_out("var_mean", result1, result2, self, dim, unbiased, keepdim, false); +} + +std::tuple std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) { + return std_var_mean_out("std_mean", result1, result2, self, dim, unbiased, keepdim, true); +} + +std::tuple var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) { + return std_var_mean_out("var_mean", result1, result2, self, {}, unbiased, false, false); +} + +std::tuple std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) { + return std_var_mean_out("std_mean", result1, result2, self, {}, unbiased, false, true); +} + +std::tuple var_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::var_mean_out(result1, result2, self, dim, unbiased, keepdim); +} + +std::tuple std_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::std_mean_out(result1, result2, self, dim, unbiased, keepdim); +} + +std::tuple std_mean(const Tensor& self, bool unbiased) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::std_mean_out(result1, result2, self, unbiased); +} + +std::tuple var_mean(const Tensor& self, bool unbiased) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::var_mean_out(result1, result2, self, unbiased); +} + Tensor var(const Tensor& self, bool unbiased) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "var only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "var only supports floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "var only supports floating-point dtypes"); auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); return trivial_return.has_value() ? 
trivial_return.value() : at::legacy::th::_th_var(self, unbiased); } @@ -629,9 +726,9 @@ Tensor &var_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool unbias } Tensor std(const Tensor& self, bool unbiased) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "std only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "std only supports floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "std only supports floating-point dtypes"); auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); return trivial_return.has_value() ? trivial_return.value() : at::legacy::th::_th_std(self, unbiased); } diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 5689f02bea57..22889b08a0bc 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -61,7 +61,7 @@ void reflection_pad1d_out_template( int64_t dim_w = 1; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 2 || input_.ndimension() == 3), "non-empty 2D " "or 3D (batch mode) tensor expected for input, but got: ", input_); @@ -79,11 +79,11 @@ void reflection_pad1d_out_template( int64_t input_w = input_.size(dim_w); int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size " + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size " "should be less than the corresponding input dimension, but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.sizes()); - AT_CHECK(output_w >= 1 , 2, + TORCH_CHECK(output_w >= 1 , 2, "input (W: ", input_w, ")is too small. Calculated output W: ", output_w); /* get contiguous input */ @@ -179,7 +179,7 @@ void reflection_pad1d_backward_out_template( int64_t input_w = input.size(dim_w); int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w == grad_output_.size(dim_w), "grad_output width unexpected." + TORCH_CHECK(output_w == grad_output_.size(dim_w), "grad_output width unexpected." 
" Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); /* get contiguous grad_output */ @@ -280,7 +280,7 @@ void reflection_pad2d_out_template( int dim_slices = 0; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 3 || input_.ndimension() == 4), "non-empty 3D or " "4D (batch mode) tensor expected for input, but got: ", input_); @@ -303,17 +303,17 @@ void reflection_pad2d_out_template( int64_t output_h = input_h + pad_t + pad_b; int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(pad_l < input_w && pad_r < input_w, + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size should be less than the corresponding " "input dimension, but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.ndimension()); - AT_CHECK(pad_t < input_h && pad_b < input_h, + TORCH_CHECK(pad_t < input_h && pad_b < input_h, "Argument #6: Padding size should be less than the corresponding " "input dimension, but got: padding (", pad_t, ", ", pad_b, ") at dimension ", dim_h, " of input ", input_.ndimension()); - AT_CHECK(output_w >= 1 || output_h >= 1, + TORCH_CHECK(output_w >= 1 || output_h >= 1, "input (H: ", input_h, ", W: ", input_w, ")is too small. Calculated " "output H: ", output_h, " W: ", output_w); @@ -435,11 +435,11 @@ void reflection_pad2d_backward_out_template( int64_t output_h = input_h + pad_t + pad_b; int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w == grad_output_.size(dim_w), + TORCH_CHECK(output_w == grad_output_.size(dim_w), "gradOutput width unexpected. Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); - AT_CHECK(output_h == grad_output_.size(dim_h), + TORCH_CHECK(output_h == grad_output_.size(dim_h), "gradOutput height unexpected. 
Expected: ", output_h, ", Got: ", grad_output_.size(dim_h)); diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index 0137c80098c9..d63d0a511d61 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -32,7 +32,7 @@ Tensor repeat_interleave(const Tensor &self, const Tensor &repeats, c10::optiona if (repeats.dim() == 0 || (repeats.dim() == 1 && repeats.size(0) == 1)) { repeats_ = repeats.reshape({1}).expand({input.size(dim.value())}); } else if (repeats.dim() == 1) { - AT_CHECK(repeats.size(0) == input.size(dim.value()), "repeats must have the same size as input along dim") + TORCH_CHECK(repeats.size(0) == input.size(dim.value()), "repeats must have the same size as input along dim") } else { AT_ERROR("repeats must be 0-dim or 1-dim tensor"); } diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index a1ba07527d8f..e44e32b6bbdc 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -6,9 +6,9 @@ namespace at { namespace native { template static inline Tensor repeat_interleave_common(const Tensor &repeats) { - AT_CHECK(repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); - AT_CHECK(repeats.scalar_type() == at::kLong, "repeats has to be Long tensor"); - AT_CHECK((repeats >= 0).all().item(), "repeats can not be negative"); + TORCH_CHECK(repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); + TORCH_CHECK(repeats.scalar_type() == at::kLong, "repeats has to be Long tensor"); + TORCH_CHECK((repeats >= 0).all().item(), "repeats can not be negative"); Tensor repeats_ = repeats.contiguous(); Tensor cumsum = repeats.cumsum(0); int64_t total = cumsum[-1].item(); diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 562f11215037..4ecab611a231 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -67,11 +67,11 @@ void replication_pad1d_out_cpu_template( int dimw = 1; int dimslices = 0; long nbatch = 1; - AT_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; - AT_CHECK(input_.numel() > 0 + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 2 || input_.ndimension() == 3), "non-empty 2D or 3D (batch mode) tensor expected for input"); @@ -87,7 +87,7 @@ void replication_pad1d_out_cpu_template( long iwidth = input_.size(dimw); long owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth >= 1, + TORCH_CHECK(owidth >= 1, "input (W: ", iwidth, ") is too small." " Calculated output W: ", owidth); @@ -193,7 +193,7 @@ Tensor& replication_pad1d_backward_out_cpu_template( int dimw = 1; int dimslices = 0; long nbatch = 1; - AT_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; @@ -209,7 +209,7 @@ Tensor& replication_pad1d_backward_out_cpu_template( long iwidth = input.size(dimw); long owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth == gradOutput_.size(dimw), + TORCH_CHECK(owidth == gradOutput_.size(dimw), "gradOutput width unexpected. 
Expected: ", owidth, " Got: ", gradOutput_.size(dimw)); @@ -329,7 +329,7 @@ void replication_pad2d_out_cpu_template(Tensor& output, const Tensor& input_, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; int pad_t = paddingSize[2]; @@ -339,7 +339,7 @@ void replication_pad2d_out_cpu_template(Tensor& output, int dimslices = 0; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && (input_.dim() == 3 || input_.dim() == 4), + TORCH_CHECK(input_.numel() > 0 && (input_.dim() == 3 || input_.dim() == 4), "3D or 4D (batch mode) tensor expected for input, but got: ", input_); if (input_.dim() == 4) @@ -357,7 +357,7 @@ void replication_pad2d_out_cpu_template(Tensor& output, int64_t oheight = iheight + pad_t + pad_b; int64_t owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth >= 1 || oheight >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1, "input (H: ", iheight, ", W: ", iwidth, " ) is too small." " Calculated output H: ", oheight, " W: ", owidth); @@ -473,7 +473,7 @@ Tensor& replication_pad2d_backward_out_cpu_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; int pad_t = paddingSize[2]; @@ -498,10 +498,10 @@ Tensor& replication_pad2d_backward_out_cpu_template( int64_t oheight = iheight + pad_t + pad_b; int64_t owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth == gradOutput_.size(dimw), + TORCH_CHECK(owidth == gradOutput_.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput_.size(dimw)); - AT_CHECK(oheight == gradOutput_.size(dimh), + TORCH_CHECK(oheight == gradOutput_.size(dimh), "gradOutput height unexpected. Expected: ", oheight, ", Got: ", gradOutput_.size(dimh)); @@ -557,7 +557,7 @@ static inline void shapeCheck3d( int dimd = 1; int dimslices = 0; - AT_CHECK(input.numel() > 0 && (input.dim() == 4 || input.dim() == 5), + TORCH_CHECK(input.numel() > 0 && (input.dim() == 4 || input.dim() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); if (input.dim() == 5) @@ -577,7 +577,7 @@ static inline void shapeCheck3d( int64_t oheight = iheight + ptop + pbottom; int64_t owidth = iwidth + pleft + pright; - AT_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, ") is too small." 
" Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); @@ -674,7 +674,7 @@ void replication_pad3d_out_cpu_template( const Tensor& input_, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; @@ -832,7 +832,7 @@ Tensor& replication_pad3d_backward_out_cpu_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 27ceb5fcf8ed..1e8e661eaf09 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -63,7 +63,7 @@ static inline void checkInBoundsForStorage( return; } int64_t new_storage_size = new_storage.numel(); - AT_CHECK( + TORCH_CHECK( storage_offset + storage_size <= new_storage_size, "setStorage: sizes ", size, ", strides ", stride, "," " and storage offset ", storage_offset, @@ -84,7 +84,7 @@ inline void setStrided( checkInBoundsForStorage(size, stride, storage_offset, self_->storage()); /* storage offset */ - AT_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); self_->set_storage_offset(storage_offset); /* size and stride */ diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index c018b3a3cc40..7e9cdf30d032 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -6,7 +6,7 @@ namespace native { Scalar item(const Tensor& self) { int64_t numel = self.numel(); - AT_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); + TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); if (self.is_sparse()) { if (self._nnz() == 0) return Scalar(0); if (self.is_coalesced()) return at::_local_scalar_dense(self._values()); diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 144d38e5400c..7831ef5035d5 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -6,9 +6,11 @@ #if defined(__CUDACC__) #include #include +#include #elif defined(__HIPCC__) #include #include +#include #else #include #define device_sqrt std::sqrt @@ -42,7 +44,7 @@ struct WelfordData { }; -template +template struct WelfordOps { bool unbiased; bool take_sqrt; @@ -80,12 +82,18 @@ struct WelfordOps { new_count }; } - inline C10_DEVICE scalar_t project(acc_t acc) const { + inline C10_DEVICE res_t project(acc_t acc) const { + auto mean = acc.mean; combine_t divisor = unbiased ? (acc.nf - 1) : acc.nf; auto ret = (divisor > 0) ? (take_sqrt ? 
device_sqrt(acc.m2 / divisor) : (acc.m2 / divisor)) : NAN; - return (scalar_t) ret; +#if defined(__CUDACC__) || defined(__HIPCC__) + thrust::tuple results((scalar_t) ret, (scalar_t) mean); +#else + std::tuple results{(scalar_t) ret, (scalar_t) mean}; +#endif + return results; } #if defined(__CUDACC__) || defined(__HIPCC__) inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { diff --git a/aten/src/ATen/native/SobolEngineOps.cpp b/aten/src/ATen/native/SobolEngineOps.cpp index 1e0af21473b7..b0b1c36664f6 100644 --- a/aten/src/ATen/native/SobolEngineOps.cpp +++ b/aten/src/ATen/native/SobolEngineOps.cpp @@ -16,9 +16,9 @@ namespace native { /// `sobolstate`. std::tuple _sobol_engine_draw(const Tensor& quasi, int64_t n, const Tensor& sobolstate, int64_t dimension, int64_t num_generated, optional dtype) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); - AT_CHECK(quasi.dtype() == at::kLong, + TORCH_CHECK(quasi.dtype() == at::kLong, "quasi needs to be of type ", at::kLong); Tensor wquasi = quasi.clone(); @@ -55,9 +55,9 @@ std::tuple _sobol_engine_draw(const Tensor& quasi, int64_t n, co /// specified above. Tensor& _sobol_engine_ff_(Tensor& quasi, int64_t n, const Tensor& sobolstate, int64_t dimension, int64_t num_generated) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); - AT_CHECK(quasi.dtype() == at::kLong, + TORCH_CHECK(quasi.dtype() == at::kLong, "quasi needs to be of type ", at::kLong); // We deal with `data` and `strides` due to performance issues. @@ -82,7 +82,7 @@ Tensor& _sobol_engine_ff_(Tensor& quasi, int64_t n, const Tensor& sobolstate, /// and a list of random lower triangular matrices consisting of 0s and 1s. `dimension` is /// passed explicitly again. Tensor& _sobol_engine_scramble_(Tensor& sobolstate, const Tensor& ltm, int64_t dimension) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); /// Require a tensor accessor for `sobolstate` @@ -121,7 +121,7 @@ Tensor& _sobol_engine_scramble_(Tensor& sobolstate, const Tensor& ltm, int64_t d /// This is a core function to initialize the main state variable of a `SobolEngine`. 
/// `dimension` is passed explicitly as well (see why above) Tensor& _sobol_engine_initialize_state_(Tensor& sobolstate, int64_t dimension) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); /// First row of `sobolstate` is 1 diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 60ba37ef69e1..aa81f09bee2f 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -127,7 +127,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_ } if (input.dim() == 0) input = input.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { @@ -151,7 +151,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_, const bool half } if (input.dim() == 0) input = input.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { @@ -183,7 +183,7 @@ Tensor softmax_backward_cpu( grad = grad.view(1); if (output.dim() == 0) output = output.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { @@ -215,7 +215,7 @@ Tensor log_softmax_backward_cpu( grad = grad.view(1); if (output.dim() == 0) output = output.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index a2b387ae621c..4fee5be8da34 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -138,11 +138,11 @@ std::tuple kthvalue_out_cpu( // FIXME: This seems bogus, I only do this because it was the old behaviour. // The reductions are fine, as long as the axis being reduced along // isn't of 0 elements (and the output has elements). - AT_CHECK( + TORCH_CHECK( self.numel() > 0, "cannot perform reduction function kthvalue", " on tensor with no elements because the operation does not have an identity"); - AT_CHECK( + TORCH_CHECK( k > 0 && k <= (self.dim() > 0 ? 
self.size(dim) : 1), "selected index k out of range"); @@ -225,7 +225,7 @@ std::tuple median( // this does not reduce to median with dim beause we don't want to copy twice Tensor median_cpu(const Tensor& self) { - AT_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); + TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); if (self.dim() == 0 && self.numel() == 1) { return self.clone(); } diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h index 9cc7afb8b4c1..b8d97750a97e 100644 --- a/aten/src/ATen/native/SortingUtils.h +++ b/aten/src/ATen/native/SortingUtils.h @@ -17,7 +17,7 @@ static void _reduction_with_indices_allocate_or_resize_output( result_sizes[dim] = 1; } if (values.defined()) { - AT_CHECK( + TORCH_CHECK( self.type() == values.type(), "output values must be of same type as input"); if (!keepdim && values.dim() == self.dim() - 1) { @@ -29,9 +29,9 @@ static void _reduction_with_indices_allocate_or_resize_output( values = at::empty(result_sizes, self.options()); } if (indices.defined()) { - AT_CHECK( + TORCH_CHECK( indices.dtype() == kLong, "output indices must be of scalar type Long"); - AT_CHECK( + TORCH_CHECK( indices.device() == self.device(), "output indices must be on same device as input"); if (!keepdim && indices.dim() == self.dim() - 1) { diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index d00169c75f3b..83d37452e479 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -24,10 +24,10 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, const bool inverse, IntArrayRef signal_sizes, const bool normalized, const bool onesided) { - AT_CHECK(signal_ndim >= 1 && signal_ndim <= 3, + TORCH_CHECK(signal_ndim >= 1 && signal_ndim <= 3, "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=", signal_ndim); - AT_CHECK(at::isFloatingType(self.scalar_type()), + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Expected an input tensor of floating types, but got input=", self.type(), self.sizes()); @@ -62,14 +62,14 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, // now we assume that input is batched as [ B x signal_dims... 
] if (complex_input) { - AT_CHECK(input.size(signal_ndim + 1) == 2, + TORCH_CHECK(input.size(signal_ndim + 1) == 2, "Expected an input tensor with a last dimension of size 2 " "representing real + imaginary components, but got input ", self.type(), self.sizes()); } // build signal_sizes and output_size - AT_CHECK(signal_sizes.size() == 0 || static_cast(signal_sizes.size()) == signal_ndim, + TORCH_CHECK(signal_sizes.size() == 0 || static_cast(signal_sizes.size()) == signal_ndim, "Expected signal_sizes to be empty (default) or of signal_ndim=", signal_ndim, "D, but got signal_sizes=", signal_sizes); std::vector output_sizes(signal_ndim + 1 + static_cast(complex_output)); @@ -98,7 +98,7 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, output_sizes[i + 1] = input_size; } checked_signal_sizes[i] = input_size; - AT_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i], + TORCH_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i], "Expected given signal_sizes=", signal_sizes," to have same " "shape with input at signal dimension ", i, ", but got " "signal_sizes=", signal_sizes, " and input=", self.type(), diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 9d0c2be2eb9e..9aaf55f47b35 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -119,7 +119,7 @@ std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { std::tuple mode_out(Tensor& values, Tensor& indices, const Tensor& self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "mode only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "mode")) { @@ -154,7 +154,7 @@ std::tuple max(const Tensor& self, int64_t dim, bool keepdim) { std::tuple max_out(Tensor& max, Tensor& max_indices, const Tensor& self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "max only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial_no_ident(max, self, dim, keepdim, "max")) { @@ -193,7 +193,7 @@ std::tuple min(const Tensor& self, int64_t dim, bool keepdim) { std::tuple min_out(Tensor& min, Tensor& min_indices, const Tensor& self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "min only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial_no_ident(min, self, dim, keepdim, "min")) { diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index a4006f93b182..75a74f112cc1 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -26,12 +26,12 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b } Tensor to(const Tensor& self, const TensorOptions& options, bool non_blocking, bool 
copy) { - AT_CHECK(options.requires_grad_opt() == c10::nullopt, + TORCH_CHECK(options.requires_grad_opt() == c10::nullopt, "to(options) expects unset requires_grad flag, but got " "options.requires_grad set as ", options.requires_grad()); const auto & layout_opt = options.layout_opt(); - AT_CHECK(!layout_opt || self.layout() == layout_opt.value(), + TORCH_CHECK(!layout_opt || self.layout() == layout_opt.value(), "to(options) doesn't support converting to a different layout, " "but got self.layout being ", self.layout(), " and options.layout set as ", options.layout()); diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 8a75f1851f1d..5cb6e18a6ffb 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -35,17 +35,17 @@ void window_function_checks( const char* function_name, const TensorOptions& options, int64_t window_length) { - AT_CHECK( + TORCH_CHECK( options.layout() != kSparse, function_name, " is not implemented for sparse types, got: ", options); - AT_CHECK( + TORCH_CHECK( at::isFloatingType(typeMetaToScalarType(options.dtype())), function_name, " expects floating point dtypes, got: ", options); - AT_CHECK( + TORCH_CHECK( window_length >= 0, function_name, " requires non-negative window_length, got window_length=", @@ -182,7 +182,7 @@ Tensor& eye_out_cpu(Tensor& result, int64_t n) { } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { - AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); if(m < 0) { m = n; @@ -459,7 +459,7 @@ Tensor& randperm_out(Tensor& result, int64_t n) { } Tensor& randperm_out_cpu(Tensor& result, int64_t n, Generator* generator) { - AT_CHECK(n >= 0, "n must be non-negative, got", n); + TORCH_CHECK(n >= 0, "n must be non-negative, got", n); result.resize_({n}); auto gen = get_generator(generator); AT_DISPATCH_ALL_TYPES(result.scalar_type(), "randperm", [&]() -> void { @@ -738,7 +738,7 @@ AT_FORALL_SCALAR_TYPES_EXCEPT_HALF_AND_QINT(TENSOR) #undef TENSOR Tensor from_file(std::string filename, c10::optional shared, c10::optional size, const TensorOptions& options) { - AT_CHECK(!options.pinned_memory(), "tensors constructed from a file cannot be pinned"); + TORCH_CHECK(!options.pinned_memory(), "tensors constructed from a file cannot be pinned"); size_t my_size = size.value_or(0); int flags = shared.value_or(false) ? 
TH_ALLOCATOR_MAPPED_SHARED : 0; auto dtype = options.dtype(); diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index c17b4c1b3d10..08916147d3f6 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -49,10 +49,10 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { inline void check_args( int64_t row, int64_t col, const TensorOptions& options) { - AT_CHECK(row >= 0, "row must be non-negative, got", row); - AT_CHECK(col >= 0, "col must be non-negative, got", col); + TORCH_CHECK(row >= 0, "row must be non-negative, got", row); + TORCH_CHECK(col >= 0, "col must be non-negative, got", col); if (options.has_layout()) { - AT_CHECK( + TORCH_CHECK( options.layout() == at::kStrided, "only support layout=torch.strided, got", options.layout()) @@ -61,7 +61,7 @@ inline void check_args( inline void check_size_nonnegative(IntArrayRef size) { for (auto x: size) { - AT_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size); + TORCH_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size); } } } // namespace native diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index ae823a30f7b1..b731b93b1dac 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -409,6 +409,23 @@ bool TensorIterator::is_trivial_1d() const { return ndim() == 1; } +bool TensorIterator::is_contiguous() const { + if (numel() == 1) { + return true; + } + if (ndim() != 1) { + return false; + } + int num_tensors = ntensors(); + for (int i = 0; i < num_tensors; i++) { + if (strides(i)[0] != element_size(i)) { + return false; + } + } + return true; +} + + bool TensorIterator::is_scalar(int arg) const { const auto& stride = operands_[arg].stride_bytes; for (int i = 0; i < ndim(); i++) { @@ -468,7 +485,7 @@ void TensorIterator::select_all_keeping_dim(int start_dim, IntArrayRef indices) std::unique_ptr TensorIterator::binary_op(Tensor& out, const Tensor& a, const Tensor& b) { auto builder = TensorIterator::Builder(); if (a.device().is_cuda() && b.device().is_cuda()) { - AT_CHECK(a.device() == b.device(), + TORCH_CHECK(a.device() == b.device(), "binary_op(): expected both inputs to be on same device, but input a " "is on ", a.device(), " and input b is on ", b.device()); } @@ -486,6 +503,14 @@ std::unique_ptr TensorIterator::unary_op(Tensor& out, const Tens return builder.build(); } +std::unique_ptr TensorIterator::nullary_op(Tensor& out) { + auto builder = TensorIterator::Builder(); + builder.add_output(out); + // FIXME: workaround for bug: https://github.com/pytorch/pytorch/issues/20342 + builder.iter_->resize_outputs_ = false; + return builder.build(); +} + std::unique_ptr TensorIterator::reduce_op(Tensor& out, const Tensor& a) { AT_ASSERT(out.defined()); auto builder = TensorIterator::Builder(); @@ -497,6 +522,28 @@ std::unique_ptr TensorIterator::reduce_op(Tensor& out, const Ten return builder.build(); } +std::unique_ptr TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tensor& a) { + AT_ASSERT(out1.defined()); + AT_ASSERT(out2.defined()); + TORCH_CHECK((!a.is_cuda() && !out1.is_cuda() && !out2.is_cuda()) || (a.device() == out1.device() && out1.device() == out2.device()), + "reduce_op(): expected input and both outputs to be on same device, but input is on ", a.device(), + ", output1 is on ", out1.device(), " and output2 is on", out2.device()); + TORCH_CHECK(out1.dim() == out2.dim(), 
"reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(), + " and output2 has ", out2.dim()); + TORCH_CHECK(out1.sizes() == out2.sizes(), "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(), + " and output2 has ", out2.sizes()); + TORCH_CHECK(out1.strides() == out2.strides(), "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(), + " and output2 has ", out2.strides()); + auto builder = TensorIterator::Builder(); + builder.add_output(out1); + builder.add_output(out2); + builder.add_input(a); + builder.iter_->promote_gpu_output_dtypes_ = true; + builder.iter_->resize_outputs_ = false; + builder.iter_->is_reduction_ = true; + return builder.build(); +} + void TensorIterator::mark_outputs() { for (int i = 0; i < num_outputs_; i++) { operands_[i].is_output = true; diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 6a9ca8ca2150..9bbcda9531ee 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -64,6 +64,7 @@ struct DimCounter { DimVector values; int64_t offset; }; + struct CAFFE2_API OperandInfo { OperandInfo() {} explicit OperandInfo(const Tensor& t, const Backend backend=Backend::Undefined, const ScalarType dtype=ScalarType::Undefined) @@ -146,12 +147,16 @@ struct CAFFE2_API TensorIterator { static std::unique_ptr binary_op(Tensor& out, const Tensor& a, const Tensor& b); static std::unique_ptr unary_op(Tensor& out, const Tensor& a); + static std::unique_ptr nullary_op(Tensor& out); static std::unique_ptr reduce_op(Tensor& out, const Tensor& a); + static std::unique_ptr reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); int ndim() const { return shape_.size(); } IntArrayRef shape() const { return shape_; } int64_t numel() const; int ntensors() const { return operands_.size(); } + int noutputs() const { return num_outputs_; } + int ninputs() const { return ntensors() - noutputs(); } /// number of elements in the output operand. this is the same as numel() for /// operations that are not reductions. 
@@ -162,6 +167,8 @@ struct CAFFE2_API TensorIterator { /// 1-dimensional iteration and no buffering or type conversion bool is_trivial_1d() const; + /// Reducible to 1-dimensional and all operands are contiguous + bool is_contiguous() const; bool is_dim_reduced(int dim) const; /// Accessors for each operand @@ -169,6 +176,7 @@ struct CAFFE2_API TensorIterator { void* data_ptr(int arg) const; ScalarType dtype(int arg=0) const { return operands_[arg].dtype; } DeviceType device_type(int arg=0) const { return backendToDeviceType(operands_[arg].backend); } + Device device(int arg=0) const { return operands_[arg].tensor.device(); } int64_t element_size(int arg) const { return elementSize(dtype(arg)); } bool is_scalar(int arg) const; bool is_cpu_scalar(int arg) const; @@ -181,6 +189,11 @@ struct CAFFE2_API TensorIterator { return operands_[arg].tensor; } + Tensor input(int arg=0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return operands_[num_outputs_ + arg].tensor; + } + /// Removes an operand from this iterator void remove_operand(int arg); /// Removes a dimension from this iterator diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 228bb1337774..e0dd738f7491 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -14,7 +14,7 @@ static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop); static void parallel_dim_reduction(TensorIterator& iter, const loop2d_t& loop); void TensorIterator::parallel_reduce(const loop2d_t& loop) { - AT_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output"); + TORCH_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output"); int64_t numel = this->numel(); if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || at::in_parallel_region()) { @@ -27,13 +27,13 @@ void TensorIterator::parallel_reduce(const loop2d_t& loop) { } static bool use_two_pass_reduction(TensorIterator& iter) { - return iter.tensor(0).numel() == 1; + return iter.output(0).numel() == 1; } static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) { int max_threads = at::get_num_threads(); - auto& dst = iter.tensor(0); + auto dst = iter.output(0); auto buffer_shape = DimVector(dst.sizes()); buffer_shape.insert(buffer_shape.begin(), max_threads); auto buffer = at::empty(buffer_shape, dst.options()); @@ -47,7 +47,7 @@ static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) { auto slice = buffer[thread_num]; slice.copy_(dst); - auto sub_iter = TensorIterator::reduce_op(slice, iter.tensor(1)); + auto sub_iter = TensorIterator::reduce_op(slice, iter.input(0)); sub_iter->serial_for_each(loop, {begin, end}); }); @@ -117,13 +117,14 @@ static void parallel_dim_reduction(TensorIterator& iter, const loop2d_t& loop) { } void TensorIterator::foreach_reduced_elt(const loop_subiter_t &loop, bool parallelize) { - AT_ASSERT(ntensors() == 2 && num_outputs_ == 1); + AT_ASSERT(ninputs() == 1); + AT_ASSERT(noutputs() >= 1); auto shape = this->shape(); - if (tensor(0).numel() == 0) { + if (output(0).numel() == 0) { return; } - if (tensor(0).numel() == 1) { + if (output(0).numel() == 1) { loop(*this); } else if (numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 64ee1fb1b5cd..1b6adb17aebd 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ 
b/aten/src/ATen/native/TensorProperties.cpp @@ -53,11 +53,37 @@ Tensor & detach_(Tensor & self) { } Tensor contiguous(const Tensor & self) { - if (self.is_contiguous()) { - return self; - } - return self.clone(); + return contiguous(self, MemoryFormat::Contiguous); } +Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { + if (self.is_contiguous(memory_format)) { + return self; + } + auto result = at::empty_like(self); + switch (memory_format) { + case MemoryFormat::Any: // Back compatibility with old defaults + case MemoryFormat::Contiguous: { + break; + } + case MemoryFormat::ChannelsLast: { + AT_CHECK( + result.dim() == 4, + " required rank 4 tensor to use channels_last format"); + std::vector newStrides(self.dim()); + auto sizes = result.sizes(); + newStrides[1] = 1; + newStrides[3] = sizes[1]; + newStrides[2] = newStrides[3] * sizes[3]; + newStrides[0] = newStrides[2] * sizes[2]; + result = result.as_strided(sizes, newStrides); + break; + } + default: { + AT_CHECK(false, " unsupported memory format"); + } + } + return result.copy_(self); } +} // namespace native } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 857585be9b55..277d715ee846 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -19,7 +19,7 @@ namespace at { namespace native { Tensor _reshape_from_tensor(const Tensor& self, const Tensor& shape_tensor) { - AT_CHECK(shape_tensor.dim() == 1); + TORCH_CHECK(shape_tensor.dim() == 1); std::vector shape; auto accessor = shape_tensor.accessor(); for (size_t i = 0; i < shape_tensor.numel(); ++i) { @@ -40,7 +40,7 @@ std::vector broadcast_tensors(TensorList tensors) { static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; - AT_CHECK(t.dim() > 0, + TORCH_CHECK(t.dim() > 0, "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); } } @@ -71,12 +71,12 @@ static void check_cat_sparse_dims(Tensor const &t, int64_t wrapped, int64_t sparse_dim, int64_t dense_dim) { - AT_CHECK(t.is_sparse(), + TORCH_CHECK(t.is_sparse(), "Concatenating sparse tensors, but a dense tensor was found at position ", pos, "."); - AT_CHECK(sizes_match_except(sizes, t.sizes(), wrapped), + TORCH_CHECK(sizes_match_except(sizes, t.sizes(), wrapped), "All tensors must have the same shape: ", sizes, " (except in the concatenating dimension)," " but found shape: ", t.sizes(), " at position ", pos, "."); - AT_CHECK(t.sparse_dim() == sparse_dim && t.dense_dim() == dense_dim, + TORCH_CHECK(t.sparse_dim() == sparse_dim && t.dense_dim() == dense_dim, "All tensors must have the same sparse_dim and dense_dim: ", sparse_dim, ", ", dense_dim, ", but tensor at position ", pos, " has ", t.sparse_dim(), ", ", t.dense_dim(), "."); } @@ -182,9 +182,9 @@ Tensor cat(TensorList tensors, int64_t dim) { } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { - AT_CHECK(self.dim() > 0, + TORCH_CHECK(self.dim() > 0, "chunk expects at least a 1-dimensional tensor"); - AT_CHECK(chunks > 0, + TORCH_CHECK(chunks > 0, "chunk expects `chunks` to be greater than 0, got: ", chunks); int64_t split_size = (self.size(dim) + chunks - 1) / chunks; @@ -210,7 +210,7 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ int64_t nDims = self.dim(); int64_t dim1 = maybe_wrap_dim(dim1_, nDims); int64_t dim2 = maybe_wrap_dim(dim2_, nDims); - AT_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); + 
TORCH_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); int64_t diag_size; int64_t storage_offset = self.storage_offset(); // compute storage offset and size for the diagonal @@ -256,7 +256,7 @@ Tensor diag_embed(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim int64_t nDims = self.dim() + 1; int64_t dim1 = maybe_wrap_dim(dim1_, nDims); int64_t dim2 = maybe_wrap_dim(dim2_, nDims); - AT_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); + TORCH_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); int64_t new_dim_len = std::abs(offset) + self.size(-1); auto sizes = self.sizes().vec(); sizes.pop_back(); @@ -275,7 +275,7 @@ Tensor expand(const Tensor& self, IntArrayRef size, bool implicit) { // distinguish between expands inserted by broadcasts and those explicitly // requested by the user, because it is legal to remove implicit expands // from the graph, but not legal to remove the explicit ones. - AT_CHECK(size.size() >= (size_t)self.dim(), + TORCH_CHECK(size.size() >= (size_t)self.dim(), "expand(", self.type(), "{", self.sizes(), "}, size=", size, "): the number of sizes provided (", size.size(), ") ", "must be greater or equal to the number of dimensions in the tensor (", @@ -293,7 +293,7 @@ Tensor expand_as(const Tensor& self, const Tensor& other) { } Tensor sum_to_size(const Tensor& self, IntArrayRef size) { - AT_CHECK(is_expandable_to(size, self.sizes()), + TORCH_CHECK(is_expandable_to(size, self.sizes()), "size {", size, "} is not expandable to size {", self.sizes(), "}."); return sum_to(self, size); @@ -302,7 +302,7 @@ Tensor sum_to_size(const Tensor& self, IntArrayRef size) { Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto tid = self.type_id(); - AT_CHECK( + TORCH_CHECK( tid == CPUTensorId() || tid == CUDATensorId(), "as_strided is only implemented for strided CPU, CUDA and QuantizedCPU tensors."); auto result = detail::make_tensor(Storage(self.storage()), tid); @@ -313,7 +313,7 @@ Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef s Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto tid = self.type_id(); - AT_CHECK( + TORCH_CHECK( tid == QuantizedCPUTensorId(), "as_strided is only implemented for strided CPU, CUDA and QuantizedCPU tensors."); auto result = detail::make_tensor(Storage(self.storage()), tid, get_qtensorimpl(self)->quantizer()); @@ -330,10 +330,10 @@ Tensor &as_strided_(Tensor& self, IntArrayRef size, IntArrayRef stride, optional Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) { int64_t allDim = self.dim(); int64_t end = start+length; - AT_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); - AT_CHECK(dim >= 0 && dim < allDim, + TORCH_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); + TORCH_CHECK(dim >= 0 && dim < allDim, "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, "."); - AT_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), + TORCH_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), "Invalid range to narrow. 
range(start, start+length) must be a subset of range(0, ", self.size(dim), ").") Tensor indices = self._indices(); int64_t sparse_dim = self.sparse_dim(); @@ -366,19 +366,19 @@ Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t } Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { - AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); + TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } - AT_CHECK(length >= 0 && start <= cur_size - length, + TORCH_CHECK(length >= 0 && start <= cur_size - length, "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); return at::slice(self, dim, start, start + length, 1); } Tensor permute(const Tensor& self, IntArrayRef dims) { auto nDims = self.dim(); - AT_CHECK(dims.size() == (size_t)nDims, + TORCH_CHECK(dims.size() == (size_t)nDims, "number of dims don't match in permute"); auto oldSizes = self.sizes(); auto oldStrides = self.strides(); @@ -387,7 +387,7 @@ Tensor permute(const Tensor& self, IntArrayRef dims) { std::vector seen(nDims); for (int64_t i = 0; i < nDims; i++) { auto dim = maybe_wrap_dim(dims[i], nDims); - AT_CHECK(!seen[dim], + TORCH_CHECK(!seen[dim], "repeated dim in permute"); seen[dim] = true; newSizes[i] = oldSizes[dim]; @@ -397,7 +397,7 @@ Tensor permute(const Tensor& self, IntArrayRef dims) { } Tensor repeat(const Tensor& self, IntArrayRef repeats) { - AT_CHECK(repeats.size() >= (size_t)self.dim(), + TORCH_CHECK(repeats.size() >= (size_t)self.dim(), "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Add new leading dimensions to the tensor if the @@ -477,7 +477,7 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ auto sizes = self.sizes().vec(); auto strides = self.strides().vec(); // TODO: support negative strides - AT_CHECK(step > 0, "slice step must be positive"); + TORCH_CHECK(step > 0, "slice step must be positive"); if (start < 0) { start += sizes[dim]; } @@ -502,10 +502,10 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ } std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { - AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); - AT_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); int64_t dim_size = self.size(dim); - AT_CHECK(split_size > 0 || self.size(dim) == 0, + TORCH_CHECK(split_size > 0 || self.size(dim) == 0, "split_size can only be 0 if dimension size is 0, " "but got dimension size of ", dim_size); // if split_size is 0 and dimension size is 0, there is 1 split. 
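The chunking arithmetic that split() relies on in the hunk above can be summarized in a small host-side sketch: every split has split_size elements except possibly the last, and a zero-sized dimension with split_size == 0 still yields exactly one (empty) split. The split_lengths helper below is illustrative only, not part of the patch.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative sketch of how split() sizes its outputs.
std::vector<int64_t> split_lengths(int64_t dim_size, int64_t split_size) {
  int64_t num_splits = 1;
  if (split_size != 0) {
    num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
  }
  std::vector<int64_t> lengths(num_splits, split_size);
  // The last chunk holds whatever remains and may be smaller (or empty).
  lengths[num_splits - 1] = dim_size - split_size * (num_splits - 1);
  return lengths;
}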
@@ -526,7 +526,7 @@ std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { } std::vector split_with_sizes(const Tensor& self, IntArrayRef split_sizes, int64_t dim) { - AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); int64_t dim_size = self.size(dim); int64_t num_splits = split_sizes.size(); std::vector splits(num_splits); @@ -535,13 +535,13 @@ std::vector split_with_sizes(const Tensor& self, IntArrayRef split_sizes for (i = 0; i < num_splits; ++i) { auto length = split_sizes[i]; - AT_CHECK(length >= 0, + TORCH_CHECK(length >= 0, "split_with_sizes expects split_sizes have only non-negative ", "entries, but got split_sizes=", split_sizes); splits[i] = self.narrow(dim, start_idx, length); start_idx += length; } - AT_CHECK(start_idx == dim_size, + TORCH_CHECK(start_idx == dim_size, "split_with_sizes expects split_sizes to sum exactly to ", dim_size, " (input tensor's size at dimension ", dim, "), ", "but got split_sizes=", split_sizes); return splits; @@ -556,14 +556,14 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d } Tensor stack(TensorList tensors, int64_t dim) { - AT_CHECK(tensors.size() > 0, + TORCH_CHECK(tensors.size() > 0, "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat(get_stack_inputs(tensors, dim), dim); } Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { - AT_CHECK(tensors.size() > 0, + TORCH_CHECK(tensors.size() > 0, "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat_out(result, get_stack_inputs(tensors, dim), dim); @@ -571,7 +571,7 @@ Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { int64_t nsparse_dim = self.sparse_dim(); - AT_CHECK(dim0 < nsparse_dim && dim1 < nsparse_dim, + TORCH_CHECK(dim0 < nsparse_dim && dim1 < nsparse_dim, "sparse transpose: transposed dimensions must be sparse ", "Got sparse_dim: ", nsparse_dim, ", d0: ", dim0, ", d1: ", dim1); @@ -644,11 +644,11 @@ static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { int64_t sparse_dim = self.sparse_dim(); int64_t dense_dim = self.dense_dim(); - AT_CHECK(sparse_dim <= 2 && dense_dim == 0, + TORCH_CHECK(sparse_dim <= 2 && dense_dim == 0, fn, " expects a tensor with <= 2 sparse and 0 dense dimensions, but got ", sparse_dim, " sparse and ", dense_dim, " dense dimensions"); } else { - AT_CHECK(self.dim() <= 2, + TORCH_CHECK(self.dim() <= 2, fn, " expects a tensor with <= 2 dimensions, but self is ", self.dim(), "D"); } } @@ -790,7 +790,7 @@ Tensor & unsqueeze_(Tensor& self, int64_t dim) { Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { start_dim = maybe_wrap_dim(start_dim, self.dim()); end_dim = maybe_wrap_dim(end_dim, self.dim()); - AT_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); + TORCH_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); if (start_dim == end_dim) { return self; @@ -834,7 +834,7 @@ std::vector unbind(const Tensor &self, int64_t dim) { std::vector meshgrid(TensorList tensors) { int64_t size = tensors.size(); - AT_CHECK(size > 0, "meshgrid expects a non-empty TensorList"); + TORCH_CHECK(size > 0, "meshgrid expects a non-empty TensorList"); std::vector shape(size); for(int64_t i = 
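stack() above delegates to cat() after get_stack_inputs() unsqueezes every tensor at the stack dimension. A hedged sketch of that shape transformation using only public ATen calls; the function name stack_sketch is illustrative.

#include <ATen/ATen.h>
#include <vector>

// Stacking along `dim` is equivalent to inserting a new size-1 dimension at
// `dim` in every input and concatenating along that new dimension.
at::Tensor stack_sketch(at::TensorList tensors, int64_t dim) {
  std::vector<at::Tensor> unsqueezed;
  unsqueezed.reserve(tensors.size());
  for (const auto& t : tensors) {
    unsqueezed.push_back(t.unsqueeze(dim));
  }
  return at::cat(unsqueezed, dim);
}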
0; i < size; i++) { switch (tensors[i].dim()) { @@ -849,8 +849,8 @@ std::vector meshgrid(TensorList tensors) { } } for(int64_t i = 0; i < size - 1; i++){ - AT_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); - AT_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); + TORCH_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); + TORCH_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); } std::vector grids; for(int64_t i = 0; i < size; i++) { diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 86b7feb405b7..8bfd15ee63bb 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -107,21 +107,21 @@ Tensor roll_cpu(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { Tensor rot90(const Tensor& self, int64_t k, IntArrayRef dims) { const int64_t total_dims = self.dim(), total_rot_dims = dims.size(); - AT_CHECK(total_rot_dims == 2, + TORCH_CHECK(total_rot_dims == 2, "expected total rotation dims == 2, but got dims = ", total_rot_dims); - AT_CHECK(total_dims >= 2, + TORCH_CHECK(total_dims >= 2, "expected total dims >= 2, but got total dims = ", total_dims); - AT_CHECK(dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims, + TORCH_CHECK(dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims, "expected rotation dims to be different, but got dim0 = ", dims[0], " and dim1 = ", dims[1]); // check range of dims - AT_CHECK(dims[0] < total_dims && dims[0] >= -total_dims, + TORCH_CHECK(dims[0] < total_dims && dims[0] >= -total_dims, "Rotation dim0 out of range, dim0 = ", dims[0]); - AT_CHECK(dims[1] < total_dims && dims[1] >= -total_dims, + TORCH_CHECK(dims[1] < total_dims && dims[1] >= -total_dims, "Rotation dim1 out of range, dim1 = ", dims[1]); // handle modulo with negative k diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 7e54c46bfb7f..c8435c736bff 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -31,18 +31,18 @@ static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, // check duplicates in dims wrap_all_dims(flip_dims_v, total_dims); flip_dims_v.erase(std::unique(flip_dims_v.begin(), flip_dims_v.end()), flip_dims_v.end()); - AT_CHECK((int64_t)flip_dims_v.size() == flip_dims_size, + TORCH_CHECK((int64_t)flip_dims_v.size() == flip_dims_size, "dims has duplicates, original flip dims size=", flip_dims_size, ", but unique flip dims size=", flip_dims_v.size()); } static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { - AT_CHECK(shifts.size() > 0, "`shifts` required"); + TORCH_CHECK(shifts.size() > 0, "`shifts` required"); if (dims.size() == 0 && shifts.size() == 1) { auto flattened = self.contiguous().view(self.numel()); return roll(flattened, shifts[0], 0).view(self.sizes()); } - AT_CHECK( + TORCH_CHECK( shifts.size() == dims.size(), "shifts and dimensions must align. 
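roll_common() above falls back to a flattened 1-D roll when no dims are given, and otherwise requires shifts and dims to have matching lengths. A hedged caller-side sketch of both behaviours, assuming a small CPU float tensor.

#include <ATen/ATen.h>

int main() {
  auto t = at::arange(6, at::kFloat).reshape({2, 3});
  // No dims: the tensor is flattened, rolled by 2, then reshaped back to {2, 3}.
  auto rolled_flat = at::roll(t, /*shifts=*/{2});
  // Explicit dim: shifts and dims lengths must align (enforced by the TORCH_CHECK above).
  auto rolled_dim = at::roll(t, /*shifts=*/{1}, /*dims=*/{1});
  return 0;
}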
shifts: ", shifts.size(), ", dims:", dims.size() ); diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 9121a36041de..2f5d47ffccb5 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -85,31 +85,38 @@ Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) { return legacy::th::_th_clamp_min_out(result, self, min); } +Tensor& fill_out(Tensor& self, const Scalar value) { + auto iter = TensorIterator::nullary_op(self); + fill_stub(iter->device_type(), *iter, value); + return self; +} + Tensor& fill_(Tensor& self, Scalar value) { - return at::legacy::th::_th_fill_(self, value); + return fill_out(self, value); } Tensor& fill_(Tensor& self, const Tensor& value) { - return at::legacy::th::_th_fill_(self, value); + TORCH_CHECK(value.dim() == 0, "fill_ only supports 0-dimension value tensor but got tensor with ", value.dim(), " dimensions."); + return fill_out(self, value.item()); } Tensor mvlgamma(const Tensor& self, int64_t p) { - AT_CHECK(at::isFloatingType(self.scalar_type()), + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().item(), + TORCH_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); - AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); + TORCH_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. + 0.5, 0.5, 0.5, self.options()); args = args.add(self.unsqueeze(-1)); return args.lgamma_().sum(-1).add_(p * (p - 1) * std::log(M_PI) / 4.); } Tensor& mvlgamma_(Tensor& self, int64_t p) { - AT_CHECK(at::isFloatingType(self.scalar_type()), + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().item(), + TORCH_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); - AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); + TORCH_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. 
+ 0.5, 0.5, 0.5, self.options()); args = args.add(self.unsqueeze(-1)); return self.copy_(args.lgamma_().sum(-1).add_(p * (p - 1) * std::log(M_PI) / 4.)); @@ -136,7 +143,8 @@ Tensor& _sigmoid_out_cpu(Tensor& result, const Tensor& self) { #define IMPLEMENT_UNARY_OP_VEC(op) \ Tensor op(const Tensor& self) { \ Tensor result = at::empty({0}, self.options()); \ - return at::op##_out(result, self); \ + at::op##_out(result, self); \ + return result; \ } \ Tensor& _##op##__cpu(Tensor& self) { \ return at::op##_out(self, self); \ @@ -152,7 +160,8 @@ Tensor& _sigmoid_out_cpu(Tensor& result, const Tensor& self) { #define IMPLEMENT_UNARY_OP_TH(op) \ Tensor op(const Tensor& self) { \ Tensor result = at::empty({0}, self.options()); \ - return at::op##_out(result, self); \ + at::op##_out(result, self); \ + return result; \ } \ Tensor& _##op##__cpu(Tensor& self) { \ return at::op##_out(self, self); \ @@ -220,6 +229,6 @@ DEFINE_DISPATCH(sqrt_stub); DEFINE_DISPATCH(tan_stub); DEFINE_DISPATCH(tanh_stub); DEFINE_DISPATCH(trunc_stub); - +DEFINE_DISPATCH(fill_stub); } } // namespace at diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index b6758ca71ebd..a74d2b72763f 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -11,6 +11,8 @@ namespace at { namespace native { using unary_fn = void(*)(TensorIterator&); +DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar), fill_stub); + DECLARE_DISPATCH(unary_fn, abs_stub); DECLARE_DISPATCH(unary_fn, acos_stub); DECLARE_DISPATCH(unary_fn, asin_stub); @@ -46,7 +48,6 @@ DECLARE_DISPATCH(void(*)(Tensor&, const double, Generator *), bernoulli_mkl_stub // digamma // lgamma // erfinv -// fill // clone // contiguous // clamp/_min/_max diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 7291fda83778..058baa6086ed 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -1,33 +1,11 @@ #include #include +#include namespace at { namespace native { -// Corresponds to THNN_CHECK_DIM_SIZE -static inline void check_dim_size( - const Tensor& data, - int64_t dim, - int64_t dim_size, - int64_t size) { - /* Check dimension size of a tensor */ - AT_CHECK( - data.dim() == dim && data.size(dim_size) == size, - "Expected tensor of dimension ", - dim, - " and tensor.size[", - dim_size, - "] == ", - size, - " but got: dimension ", - data.dim(), - " and tensor.size[", - dim_size, - "] = ", - data.size(dim_size)); -} - static inline void upsample_1d_shape_check( const Tensor& input, const Tensor& grad_output, @@ -35,7 +13,7 @@ static inline void upsample_1d_shape_check( int64_t nchannels, int64_t input_width, int64_t output_width) { - AT_CHECK( + TORCH_CHECK( input_width > 0 && output_width > 0, "Input and output sizes should be greater than 0, but got input (W: ", input_width, @@ -44,7 +22,7 @@ static inline void upsample_1d_shape_check( ")"); if (input.defined()) { - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && input.dim() == 3, "Non-empty 3D data tensor expected but got a tensor with sizes ", input.sizes()); @@ -64,7 +42,7 @@ static inline void upsample_2d_shape_check( int64_t input_width, int64_t output_height, int64_t output_width) { - AT_CHECK( + TORCH_CHECK( input_height > 0 && input_width > 0 && output_height > 0 && output_width > 0, "Input and output sizes should be greater than 0," @@ -79,7 +57,7 @@ static inline void upsample_2d_shape_check( ")"); if (input.defined()) { - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && input.dim() == 4, "Non-empty 4D data tensor 
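For context on the DECLARE_DISPATCH line added above: the stub pattern splits a kernel across a declaration in the header, one DEFINE_DISPATCH in the operator's translation unit, and one REGISTER_DISPATCH per backend/CPU-capability kernel file, with the call site selecting the implementation by device type at runtime. The condensed map below only restates where each fill_stub piece lives in this diff; it adds nothing new.

// UnaryOps.h                      declare the stub and its signature:
//   DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar), fill_stub);
// UnaryOps.cpp                    define the dispatch table entry once:
//   DEFINE_DISPATCH(fill_stub);
// cpu/UnaryOpsKernel.cpp          register the CPU kernel:
//   REGISTER_DISPATCH(fill_stub, &fill_kernel);
// call site (fill_out)            dispatch by device type:
//   fill_stub(iter->device_type(), *iter, value);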
expected but got a tensor with sizes ", input.sizes()); @@ -102,7 +80,7 @@ static inline void upsample_3d_shape_check( int64_t output_depth, int64_t output_height, int64_t output_width) { - AT_CHECK( + TORCH_CHECK( input_depth > 0 && input_height > 0 && input_width > 0 && output_depth > 0 && output_height > 0 && output_width > 0, "Input and output sizes should be greater than 0, but got input (D: ", @@ -120,7 +98,7 @@ static inline void upsample_3d_shape_check( ")"); if (input.defined()) { - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && input.dim() == 5, "Non-empty 5D data tensor expected but got a tensor with sizes ", input.sizes()); diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 95e219879702..056520893676 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -166,7 +166,7 @@ static void upsample_bicubic2d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); @@ -217,12 +217,12 @@ static void upsample_bicubic2d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 4, "It is expected input_size equals to 4, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 6c91d688c791..3d273dca2ef7 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -159,7 +159,7 @@ static void upsample_bilinear2d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); @@ -214,12 +214,12 @@ static void upsample_bilinear2d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 4, "It is expected input_size equals to 4, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index 3f34aabcf583..07dd58b63f57 100644 --- a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -112,7 +112,7 @@ static void upsample_linear1d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); @@ -159,12 +159,12 @@ static void upsample_linear1d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 3, "It is expected input_size equals to 3, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 4d99943b727a..176aab3bad18 100644 --- 
a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -94,7 +94,7 @@ static void upsample_nearest1d_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); @@ -139,12 +139,12 @@ static void upsample_nearest1d_backward_out_cpu_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 3, "It is expected input_size equals to 3, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index eb9d5fc477aa..797c8e9d5123 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -119,7 +119,7 @@ static void upsample_nearest2d_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); @@ -170,12 +170,12 @@ static void upsample_nearest2d_backward_out_cpu_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 4, "It is expected input_size equals to 4, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index fd550fdf0bd8..37d763613426 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -154,7 +154,7 @@ static void upsample_nearest3d_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); @@ -213,12 +213,12 @@ static void upsample_nearest3d_backward_out_cpu_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 5, "It is expected input_size equals to 5, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 37f6082cd0fb..34096dfa09e3 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -222,7 +222,7 @@ static void upsample_trilinear3d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); @@ -284,12 +284,12 @@ static void upsample_trilinear3d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 5, "It is expected input_size equals to 5, but got size ", input_size.size()); diff --git 
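All of the upsample_*d templates touched above share the same precondition: output_size must have exactly one entry per spatial dimension, and the input_size used by the backward templates must have spatial dims + 2 entries (batch and channel). A hedged restatement of that invariant as a standalone check; the helper below is illustrative and not part of ATen.

#include <cstdint>
#include <stdexcept>

// A k-d upsample expects output_size.size() == k and input_size.size() == k + 2.
void check_upsample_sizes(int64_t spatial_dims,
                          int64_t output_size_len,
                          int64_t input_size_len) {
  if (output_size_len != spatial_dims) {
    throw std::runtime_error("output_size must have one entry per spatial dim");
  }
  if (input_size_len != spatial_dims + 2) {
    throw std::runtime_error("input_size must be {N, C, spatial...}");
  }
}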
a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp index caaeed77b7de..d1bc46809c53 100644 --- a/aten/src/ATen/native/WeightNorm.cpp +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -38,7 +38,7 @@ Tensor _weight_norm int64_t dim) { - AT_CHECK( + TORCH_CHECK( v_in.device() == g_in.device(), "weight_norm: expected v_in and g_in to be on the same device, but v_in is " "on ", v_in.device(), " and g_in is on ", g_in.device()); @@ -73,17 +73,17 @@ std::tuple _weight_norm_differentiable_backward // In Functions.cpp, the HardshrinkBackward object supplies "grad.contiguous()" // as the first argument, so grad_w should be contiguous here. // All these checks should succeed: - AT_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); - AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); - AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); - AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + TORCH_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); int64_t last_dim = saved_v.dim() - 1; int64_t last_size = saved_v.size(last_dim); // Like weight_norm_fused_backward, weight_norm_differentiable_backward should only ever be called // through a WeightNormFusedBackward object, so we expect that dim == 0 || dim == saved_v.size(-1) - AT_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); + TORCH_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); // saved_g and saved_norms are already shaped to broadcast over the correct dimensions diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 529075a1e20f..86331a31ecf6 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -1,68 +1,53 @@ -#include - #include -#include + #include -#include +#include #include #include -#include namespace at { namespace native { namespace { template -void copy_kernel_cast_t_impl(Tensor& self, const Tensor& src) { - auto builder = TensorIterator::Builder(); - builder.add_output(self); - builder.add_input(src); - builder.dont_resize_outputs(); - builder.dont_compute_common_dtype(); - auto iter = builder.build(); - +void copy_kernel_cast(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND2( - at::ScalarType::Half, - at::ScalarType::Bool, - src.scalar_type(), + ScalarType::Half, + ScalarType::Bool, + iter.dtype(1), "copy_kernel_cast", [&] { - at::native::unary_kernel(*iter, [=](scalar_t a) -> self_T { + at::native::unary_kernel(iter, [=](scalar_t a) -> self_T { return static_cast( static_cast>(a)); }); }); } -static void copy_kernel_cast_impl(Tensor& self, const Tensor& src) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, - self.scalar_type(), "copy_kernel_cast", [&]() { copy_kernel_cast_t_impl(self, src); }); -} - -static void copy_kernel_same_type_impl(Tensor& self, const Tensor& src) { - auto builder = TensorIterator::Builder(); - builder.add_output(self); - builder.add_input(src); - builder.dont_resize_outputs(); - auto iter = builder.build(); - - if (self.scalar_type() == at::ScalarType::Half) { - unary_kernel(*iter, [=](at::Half a) -> at::Half { return a; }); +static void copy_kernel(TensorIterator& iter, bool non_blocking) { + ScalarType dtype = 
iter.dtype(0); + if (dtype == iter.dtype(1)) { + if (dtype == ScalarType::Half) { + unary_kernel(iter, [=](at::Half a) -> at::Half { return a; }); + } else { + AT_DISPATCH_ALL_TYPES_AND( + ScalarType::Bool, dtype, "copy_kernel", [&] { + unary_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return a; }, + [=](Vec256 a) { return a; }); + }); + } } else { - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Bool, self.scalar_type(), "copy_kernel_same_type", [&] { - unary_kernel_vec( - *iter, - [=](scalar_t a) -> scalar_t { return a; }, - [=](Vec256 a) { return a; }); - }); + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::Bool, dtype, "copy_", [&] { + copy_kernel_cast(iter); + }); } } } // anonymous namespace -REGISTER_DISPATCH(copy_kernel_same_type, ©_kernel_same_type_impl); -REGISTER_DISPATCH(copy_kernel_cast, ©_kernel_cast_impl); +REGISTER_DISPATCH(copy_stub, ©_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cpu/CopyKernel.h b/aten/src/ATen/native/cpu/CopyKernel.h deleted file mode 100644 index 917c546bffbe..000000000000 --- a/aten/src/ATen/native/cpu/CopyKernel.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include -#include - -namespace at { -namespace native { - -using forward_fn = void (*)(Tensor&, const Tensor&); - -DECLARE_DISPATCH(forward_fn, copy_kernel_same_type); -DECLARE_DISPATCH(forward_fn, copy_kernel_cast); - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 320ad862e4be..26bb69e8e589 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -47,6 +47,11 @@ static inline bool is_unary_contiguous_s1(const int64_t* strides) { strides[1] == 0; } +template +static inline bool is_nullary_contiguous(const int64_t* strides) { + return strides[0] == sizeof(typename traits::result_type); +} + // result is static inline bool is_reduction(char** data, const int64_t* strides) { return strides[0] == 0 && @@ -93,6 +98,77 @@ static inline bool is_reduction(char** data, const int64_t* strides) { const char* in1_ptr = data[1]; \ const char* in2_ptr = data[2]; +#define NULLARY_LOOP_HEADER(func_t, data, strides) \ + using traits = nullary_function_traits; \ + using arg0_t = typename traits::result_type; \ + char* out_ptr = data[0]; \ + int64_t s0 = strides[0]; + + #define NULLARY_VEC_HEADER(func_t) \ + using traits = nullary_function_traits; \ + using scalar_t = typename traits::result_type; \ + using Vec = Vec256; + + #define NULLARY_VEC_LOOP_HEADER(func_t, data) \ + NULLARY_VEC_HEADER(func_t) \ + char* out_ptr = data[0]; + + +// Basic loop fill operation (zero inputs, one output). May be auto-vectorized +// by the compiler. 
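The rewritten copy_kernel above consolidates the old copy_kernel_same_type / copy_kernel_cast stubs into a single copy_stub: if the output and input dtypes of the iterator match, it runs a (vectorizable) identity kernel; otherwise it dispatches on the destination dtype and casts element-wise. A hedged, dependency-free sketch of that branching; the raw loops stand in for the real AT_DISPATCH_* macros and unary_kernel(_vec).

#include <cstdint>
#include <cstring>

// Cast path: destination and source dtypes differ, convert per element.
template <typename dst_t, typename src_t>
void copy_cast_sketch(dst_t* dst, const src_t* src, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    dst[i] = static_cast<dst_t>(src[i]);
  }
}

// Same-dtype path: a plain identity copy, which the real kernel vectorizes.
template <typename T>
void copy_same_sketch(T* dst, const T* src, int64_t n) {
  std::memcpy(dst, src, n * sizeof(T));
}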
+template +static inline void nullary_loop(char** data, const int64_t* strides, int64_t i, int64_t n, func_t op) { + NULLARY_LOOP_HEADER(func_t, data, strides) + for (; i < n; i++) { + arg0_t out = op(); + *(arg0_t*)(out_ptr + i * s0) = out; + } +} + + // computes out = op() +template +static inline void vectorized_nullary_loop(char** data, int64_t n, func_t op, vec_func_t vop) { + NULLARY_VEC_LOOP_HEADER(func_t, data) + int64_t i = 0; + for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { + auto out1 = vop(); + auto out2 = vop(); + out1.store(out_ptr + i * sizeof(scalar_t)); + out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t)); + } + int64_t strides[] = { sizeof(scalar_t) }; + nullary_loop(data, strides, i, n, op); +} + +template +void nullary_kernel(TensorIterator& iter, func_t op) { + AT_ASSERT(iter.ntensors() > 0) + using traits = nullary_function_traits; + + iter.for_each([&](int ntensor, char** data, const int64_t* strides, int64_t n) { + // Specializations to encourage auto-vectorization (trick from Numpy's loops.c.src) + if (is_nullary_contiguous(strides)) { + nullary_loop(data, strides, 0, n, op); + } else { + nullary_loop(data, strides, 0, n, op); + } + }); +} + + template +void nullary_kernel_vec(TensorIterator& iter, func_t op, vec_func_t vop) { + AT_ASSERT(iter.ntensors() > 0) + using traits = nullary_function_traits; + + iter.for_each([&](int ntensor, char** data, const int64_t* strides, int64_t n) { + if (is_nullary_contiguous(strides)) { + vectorized_nullary_loop(data, n, op, vop); + } else { + nullary_loop(data, strides, 0, n, op); + } + }); +} + // Basic loop unary operation (one input, one output). May be auto-vectorized // by the compiler. template diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index dbc469a73342..b6adf4ae5717 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -25,6 +25,44 @@ static inline bool is_outer_reduction(const int64_t* strides) { strides[3] == sizeof(typename traits::arg2_t); } +template +static void set_result(const int index, const res_t result, const TensorIterator &iter, const int num_outputs) { + static_assert(std::is_same::value, "data types must match"); + if (index < num_outputs) { + char *out = (char *) iter.data_ptr(index); + *(res_t *) out = result; + } +} + +template +static void set_results(const res_t result, const TensorIterator &iter, const int num_outputs) { + AT_ASSERT(num_outputs == 1); + set_result(0, result, iter, num_outputs); +} + +template +static inline typename std::enable_if::type +for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { + return i; +} + +template +static inline typename std::enable_if::type +for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { + if (i < num_outputs) { + set_result(i, std::get(t), iter, num_outputs); + return for_each_in_tuple(t, iter, num_outputs); + } + return i; +} + +template +static void set_results(const std::tuple& result, const TensorIterator &iter, const int num_outputs) { + AT_ASSERT(num_outputs >= 1); + std::size_t result_size = for_each_in_tuple(result, iter, num_outputs); + AT_ASSERT(num_outputs == result_size); +} + template struct all_same : c10::guts::conjunction< std::is_same... 
@@ -64,7 +102,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { using c_traits = binary_function_traits; using p_traits = unary_function_traits; using acc_t = typename p_traits::arg1_t; - using data_t = typename p_traits::result_type; + using data_t = typename r_traits::arg2_t; static_assert( all_same< acc_t, @@ -75,19 +113,17 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { typename c_traits::arg2_t, typename c_traits::result_type>::value, "all accumulate types must match"); - static_assert( - std::is_same::value, - "all data types must match"); static_assert( std::is_default_constructible::value, "the accumulate type must be default-constructible" ); - iter.foreach_reduced_elt([&](TensorIterator &sub_iter) { - auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t { - sub_iter.serial_for_each([&acc, &ops](int ntensors, char** data, const int64_t* strides, int64_t size) { - AT_ASSERT(ntensors == 2); - char *in = data[1]; - int64_t stride = strides[1]; + const int num_outputs = iter.noutputs(); + iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIterator &sub_iter) { + auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t { + sub_iter.serial_for_each([&acc, &ops, num_outputs](int ntensors, char** data, const int64_t* strides, int64_t size) { + AT_ASSERT(ntensors - num_outputs == 1); + char *in = data[ntensors - 1]; + int64_t stride = strides[ntensors - 1]; for (int64_t i = 0; i < size; ++i) { acc = ops.reduce(acc, *(data_t*)in); in += stride; @@ -118,8 +154,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { total_acc = ops.combine(total_acc, buffer[i]); } } - char *out = (char *)sub_iter.data_ptr(0); - *(data_t*)out = ops.project(total_acc); + set_results(ops.project(total_acc), sub_iter, num_outputs); }); } diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index b895636b2ffd..95a1b9c1bfb2 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -38,7 +38,7 @@ static void std_var_kernel_impl(TensorIterator &iter, bool unbiased, bool take_s AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "std_cpu", [&] { binary_kernel_reduce( iter, - WelfordOps { unbiased, take_sqrt }, + WelfordOps> { unbiased, take_sqrt }, WelfordData() ); }); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 9ede5385367f..3f9aecac5474 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -54,6 +54,25 @@ static void abs_kernel(TensorIterator& iter) { }); } +static void fill_kernel(TensorIterator& iter, Scalar value_scalar) { + if( iter.dtype() == ScalarType::Half ) { + auto value = value_scalar.to().x; + using H = decltype(value); + nullary_kernel_vec( + iter, + [=]() -> H { return value; }, + [=]() { return Vec256(value); }); + } else { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, iter.dtype(), "fill_cpu", [&]() { + scalar_t value = value_scalar.to(); + nullary_kernel_vec( + iter, + [=]() -> scalar_t { return value; }, + [=]() { return Vec256(value); }); + }); + } +} + static void frac_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "frac_cpu", [&]() { unary_kernel_vec( @@ -192,6 +211,7 @@ REGISTER_DISPATCH(abs_stub, &abs_kernel); REGISTER_DISPATCH(frac_stub, &frac_kernel); REGISTER_DISPATCH(reciprocal_stub, 
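The Reduce.h changes above generalize binary_kernel_reduce to multi-output reductions: ops.project() may now return a std::tuple, and set_results() walks that tuple writing element i to output i of the sub-iterator, asserting the arity against noutputs(). The WelfordOps change in ReduceOpsKernel.cpp is the motivating user, since one Welford pass can then publish more than a single value. Below is a hedged, plain-C++ sketch of a one-pass Welford accumulator whose project() yields a (std-or-var, mean) tuple; the struct is illustrative and is not the WelfordOps implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <tuple>

struct WelfordSketch {
  double mean = 0.0, m2 = 0.0;
  int64_t n = 0;

  // Fold one element into the running mean and M2 (sum of squared deviations).
  void reduce(double x) {
    ++n;
    double delta = x - mean;
    mean += delta / n;
    m2 += delta * (x - mean);
  }

  // Multi-output projection, analogous to feeding a tuple into set_results().
  std::tuple<double, double> project(bool unbiased, bool take_sqrt) const {
    double var = m2 / std::max<int64_t>(unbiased ? n - 1 : n, 1);
    return std::make_tuple(take_sqrt ? std::sqrt(var) : var, mean);
  }
};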
&reciprocal_kernel); REGISTER_DISPATCH(neg_stub, &neg_kernel); +REGISTER_DISPATCH(fill_stub, &fill_kernel); // IMPLEMENT_FLOAT_KERNEL(ALL, abs) IMPLEMENT_FLOAT_KERNEL(FLOATING, acos) diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp new file mode 100644 index 000000000000..638d010371d9 --- /dev/null +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -0,0 +1,79 @@ +#include + +#include +#include +#include + +namespace at { +namespace native { + +namespace { + +template +void LayerNormKernelImplInternal( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + T eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + DCHECK_EQ(X.numel(), M * N); + DCHECK(!gamma.defined() || gamma.numel() == N); + DCHECK(!beta.defined() || beta.numel() == N); + const T* X_data = X.data(); + const T* gamma_data = gamma.defined() ? gamma.data() : nullptr; + const T* beta_data = beta.defined() ? beta.data() : nullptr; + T* Y_data = Y->data(); + T* mean_data = mean->data(); + T* rstd_data = rstd->data(); + const T c = T(1) / static_cast(N); + const bool gamma_null = gamma_data == nullptr; + const bool beta_null = beta_data == nullptr; + for (int64_t i = 0; i < M; ++i) { + const T* X_ptr = X_data + i * N; + T* Y_ptr = Y_data + i * N; + T mean_val = T(0); + T rstd_val = T(0); + for (int64_t j = 0; j < N; ++j) { + mean_val += X_ptr[j]; + rstd_val += X_ptr[j] * X_ptr[j]; + } + mean_val *= c; + rstd_val = T(1) / std::sqrt(rstd_val * c - mean_val * mean_val + eps); + const T scale = rstd_val; + const T bias = -rstd_val * mean_val; + for (int64_t j = 0; j < N; ++j) { + const T gamma_v = gamma_null ? T(1) : gamma_data[j]; + const T beta_v = beta_null ? T(0) : beta_data[j]; + Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; + } + mean_data[i] = mean_val; + rstd_data[i] = rstd_val; + } +} + +void LayerNormKernelImpl( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + double eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "LayerNormKernelImpl", [&]() { + LayerNormKernelImplInternal( + X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + }); +} + +} // namespace + +REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.h b/aten/src/ATen/native/cpu/layer_norm_kernel.h new file mode 100644 index 000000000000..ae39aa76e5e8 --- /dev/null +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.h @@ -0,0 +1,26 @@ +#ifndef ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_ +#define ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_ + +#include +#include + +namespace at { +namespace native { + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */, + Tensor* /* mean */, + Tensor* /* rstd */); + +DECLARE_DISPATCH(forward_fn, LayerNormKernel); + +} // namespace native +} // namespace at + +#endif // ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_ diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index e0c2148f35e0..8c95313c4a09 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -47,14 +47,14 @@ __global__ void prelu_cuda_kernel_multi_weights( } Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { - AT_CHECK(self.is_cuda()); - 
AT_CHECK(weight_.is_cuda()); + TORCH_CHECK(self.is_cuda()); + TORCH_CHECK(weight_.is_cuda()); auto input = self.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(weight.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int64_t weight_num = weight.numel(); Tensor result = at::empty_like(input); @@ -71,7 +71,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { } else { // case2: multiple weights, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_stride0 = 1, input_stride1 = 1; @@ -81,7 +81,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); @@ -92,7 +92,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - AT_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu: input too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu: input too large or too many dimensions"); AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_cuda", [&] { prelu_cuda_kernel_multi_weights @@ -155,17 +155,17 @@ __global__ void prelu_cuda_backward_kernel_multi_weights( } std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Tensor& self, const Tensor& weight_) { - AT_CHECK(grad_out_.is_cuda()); - AT_CHECK(self.is_cuda()); - AT_CHECK(weight_.is_cuda()); + TORCH_CHECK(grad_out_.is_cuda()); + TORCH_CHECK(self.is_cuda()); + TORCH_CHECK(weight_.is_cuda()); auto input = self.contiguous(); auto grad_out = grad_out_.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(weight.is_contiguous()); - AT_CHECK(grad_out.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); + TORCH_CHECK(grad_out.is_contiguous()); int64_t weight_num = weight.numel(); auto strides = input.strides(); @@ -187,7 +187,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te } else { // case2: multiple parameters, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_stride0 = 1, input_stride1 = 1; @@ -197,7 +197,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. 
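prelu_cuda above has two code paths: a single shared weight (weight_num == 1) and one weight per channel, where the channel index is recovered from the flattened element index via the dim-0/dim-1 strides. The elementwise rule is simply out = x if x >= 0, else w_c * x. A hedged CPU sketch of the per-channel case; the contiguous-NCHW layout assumption is mine, not taken from the kernel.

#include <cstdint>

// Per-channel PReLU over a contiguous NCHW buffer:
// channel = (i / inner) % C, where inner = H * W.
void prelu_nchw_sketch(const float* in, const float* weight, float* out,
                       int64_t numel, int64_t channels, int64_t inner) {
  for (int64_t i = 0; i < numel; ++i) {
    int64_t c = (i / inner) % channels;
    out[i] = in[i] >= 0.f ? in[i] : weight[c] * in[i];
  }
}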
Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); @@ -208,7 +208,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - AT_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu_backward_cuda: input too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu_backward_cuda: input too large or too many dimensions"); AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_backward_cuda", [&] { prelu_cuda_backward_kernel_multi_weights diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 7211aa3e895b..7f963f1c7a2e 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -214,13 +214,13 @@ namespace { checkAllSameGPU("cudnn_adaptive_avg_pooling2d", {input_arg, output_arg}); for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); Tensor input_ = input; int64_t grid_x = input.size(-3); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu new file mode 100644 index 000000000000..a088dcc3f95d --- /dev/null +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -0,0 +1,517 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include // for atomicAdd +#include + +#include +#include +#include + +namespace at { +namespace native { + +namespace { + +__device__ inline int start_index(int a, int b, int c) { + return (int)std::floor((float)(a * c) / b); +} + +__device__ inline int end_index(int a, int b, int c) { + return (int)std::ceil((float)((a + 1) * c) / b); +} + +// 5d tensor B x D x T x H x W +// All kernels view batch dim B and dim D as collapsed. + +/* + * Description: + * this function adaptively average pools an input 5D tensor along dimensions + * 2, 3, and 4 5D input, 5D output + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + */ +template +__global__ void adaptiveaveragepool( + scalar_t *input, scalar_t *output, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t istrideD, + int64_t istrideT, int64_t istrideH, int64_t istrideW, + int64_t offsetZ) { + // iterates on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // slice/feature + + // input frame/time range is fixed. 
+ int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // input offset by slice/feature and earliest relevant frame/time + scalar_t *input_dt = input + d*istrideD + istartT*istrideT; + // output offset by slice/feature and frame/time + scalar_t *output_dt = output + o_plane*osizeH*osizeW; + + // For all output pixels... + for (oh = ostartH; oh < oendH; oh += ostepH) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = ostartW; ow < oendW; ow += ostepW) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the average pooling from corresponding input pixels + scalar_t *ptr_input = input_dt + istartH*istrideH + istartW*istrideW; + scalar_t *ptr_output = output_dt + oh*osizeW + ow; + scalar_t sum = ScalarConvert::to(0); + + int it, ih, iw; + for (it = 0; it < kT; ++it) { + for (ih = 0; ih < kH; ++ih) { + for (iw = 0; iw < kW; ++iw) { + scalar_t val = ptr_input[ih*istrideH + iw*istrideW]; + sum += val; + } + } + ptr_input += istrideT; // next input frame + } + // Update output + *ptr_output = sum / kT / kH / kW; + } + } +} + +template +void adaptiveaveragepool_loop( + scalar_t *input_data, scalar_t *output_data, + int64_t totalZ, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) { + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = std::max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); + adaptiveaveragepool<<>>( + input_data, output_data, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, + istrideT, istrideH, istrideW, + offsetZ); + + totalZ -= 65535; + offsetZ += 65535; + AT_CUDA_CHECK(cudaGetLastError()); + } +} + +/* + * Description: + * This function computes the gradInput from gradOutput. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + */ +template +__global__ void adaptiveaveragegradinput( + scalar_t *gradInput, scalar_t *gradOutput, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ) +{ + // iterators on input pixels + int it, ih, iw; + + // compute offsets based on thread/block ID + int istartH = blockIdx.y * blockDim.y + threadIdx.y; + int iendH = isizeH; + int istepH = gridDim.y * blockDim.y; + int istartW = threadIdx.x; + int iendW = isizeW; + int istepW = blockDim.x; + + // select input plane + int64_t i_plane = blockIdx.x + offsetZ; + it = i_plane % isizeT; // output frame/time + int d = i_plane / isizeT; // slice/feature + + // output frame/time range is fixed. + int ostartT = start_index(it, isizeT, osizeT); + int oendT = end_index(it, isizeT, osizeT); + + // gradInput offset by slice/feature and frame/time. + scalar_t *gradInput_dt = gradInput + i_plane*isizeH*isizeW; + // gradOutput offset by slice/feature and earliest relevant frame/time + scalar_t *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW; + + // For all input pixels... 
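The adaptive pooling kernels in this new file all rely on the same index mapping: output cell o over an axis with input length isize and output length osize covers input indices [floor(o*isize/osize), ceil((o+1)*isize/osize)), and the forward pass averages over the resulting T x H x W window. A hedged 1-D CPU sketch of that mapping and the averaging; the function names are illustrative.

#include <cmath>
#include <cstdint>

// Same mapping as the device start_index/end_index helpers above.
static inline int64_t start_idx(int64_t o, int64_t osize, int64_t isize) {
  return (int64_t)std::floor((double)(o * isize) / osize);
}
static inline int64_t end_idx(int64_t o, int64_t osize, int64_t isize) {
  return (int64_t)std::ceil((double)((o + 1) * isize) / osize);
}

// Illustrative 1-D adaptive average pool.
void adaptive_avg_pool1d_sketch(const float* in, float* out,
                                int64_t isize, int64_t osize) {
  for (int64_t o = 0; o < osize; ++o) {
    int64_t i0 = start_idx(o, osize, isize);
    int64_t i1 = end_idx(o, osize, isize);
    float sum = 0.f;
    for (int64_t i = i0; i < i1; ++i) {
      sum += in[i];
    }
    out[o] = sum / (i1 - i0);
  }
}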
+ for (ih = istartH; ih < iendH; ih += istepH) { + int ostartH = start_index(ih, isizeH, osizeH); + int oendH = end_index(ih, isizeH, osizeH); + + for (iw = istartW; iw < iendW; iw += istepW) { + int ostartW = start_index(iw, isizeW, osizeW); + int oendW = end_index(iw, isizeW, osizeW); + + // Compute the gradients from corresponding output pixels + scalar_t *ptr_gradInput = gradInput_dt + ih*isizeW + iw; + scalar_t *ptr_gradOutput = gradOutput_dt; + + // for all relevant output pixels + int ot, oh, ow; + for (ot = ostartT; ot < oendT; ++ot) { + int kT = end_index(ot, osizeT, isizeT) - start_index(ot, osizeT, isizeT); + for (oh = ostartH; oh < oendH; ++oh) { + int kH = end_index(oh, osizeH, isizeH) - start_index(oh, osizeH, isizeH); + for (ow = ostartW; ow < oendW; ++ow) { + int kW = end_index(ow, osizeW, isizeW) - start_index(ow, osizeW, isizeW); + scalar_t grad_delta = ptr_gradOutput[oh*isizeW + ow] / kW / kH / kT; + *ptr_gradInput += grad_delta; + } + } + ptr_gradOutput += osizeH*osizeW; // next output frame + } + } + } +} + +template +void adaptiveaveragegradinput_loop( + scalar_t *gradInput_data, scalar_t *gradOutput_data, + int64_t totalZ, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW) { + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = std::max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); + adaptiveaveragegradinput<<>>( + gradInput_data, gradOutput_data, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + offsetZ); + + totalZ -= 65535; + offsetZ += 65535; + AT_CUDA_CHECK(cudaGetLastError()); + } +} + +/* + * Description: + * This function computes the gradInput from gradOutput. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + * + * (uses atomic add) + * + */ +template +__global__ void atomicadaptiveaveragegradinput( + scalar_t *gradInput, scalar_t *gradOutput, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ) +{ + // iterators on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // output slice/feature + + // input frame/time range is fixed. + int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // gradInput offset by slice/feature and earliest relevant frame/time + scalar_t *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW; + // gradOutput offset by slice/feature and frame/time + scalar_t *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW; + + // For all output pixels... 
+ for (oh = ostartH; oh < oendH; oh += ostepH) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = ostartW; ow < oendW; ow += ostepW) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the gradients from corresponding input pixels + scalar_t *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW; + scalar_t *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow; + scalar_t grad_delta = *ptr_gradOutput / kT / kH / kW; + + int it, ih, iw; + for (it = 0; it < kT; ++it) { + for (ih = 0; ih < kH; ++ih) { + for (iw = 0; iw < kW; ++iw) { + atomicAdd(&(ptr_gradInput[ih*isizeW + iw]), grad_delta); + } + } + ptr_gradInput += isizeH*isizeW; // next input frame + } + } + } +} + +template +void atomicadaptiveaveragegradinput_loop( + scalar_t* gradInput_data, scalar_t* gradOutput_data, + int64_t totalZ, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW) { + int64_t offsetZ = 0; + dim3 threads(32, 8); + int blocksH = std::max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); + atomicadaptiveaveragegradinput<<>>( + gradInput_data, gradOutput_data, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + offsetZ); + + totalZ -= 65535; + offsetZ += 65535; + AT_CUDA_CHECK(cudaGetLastError()); + } +} + +// 5D tensor B x D x T x H x w + +void adaptive_avg_pool3d_out_cuda_template( + Tensor& output, + const Tensor& input_, + IntArrayRef& output_size) { + TensorArg output_arg{output, "output", 1}; + TensorArg input_arg{input_, "input_", 2}; + + checkAllSameGPU("adaptive_avg_pool3d_cuda", {output_arg, input_arg}); + + for (int64_t i = 0; i < input_.ndimension(); i++) { + TORCH_CHECK( + input_.size(i) > 0, + "adaptive_avg_pool3d_cuda(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", input_.sizes(), + " with dimension ", i, " being empty"); + } + + TORCH_CHECK( + (input_.ndimension() == 4 || input_.ndimension() == 5), + "non-empty 4D or 5D (batch mode) tensor expected for input"); + + // the jit sometimes passes output_size.size() == 1 + TORCH_CHECK( + output_size.size() == 1 || output_size.size() == 3, + "adaptive_avg_pool3d: internal error: output_size.size() must be 1 or 3"); + + int64_t osizeT = output_size[0]; + int64_t osizeH = output_size[1]; + int64_t osizeW = output_size[2]; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t istrideD, istrideT, istrideH, istrideW; + int64_t totalZ; + + const Tensor& input = input_.ndimension() == 4 ? 
input_ : input_.contiguous(); + + if (input.ndimension() == 4) { + sizeD = input.size(0); + isizeT = input.size(1); + isizeH = input.size(2); + isizeW = input.size(3); + + istrideD = input.stride(0); + istrideT = input.stride(1); + istrideH = input.stride(2); + istrideW = input.stride(3); + + output.resize_({sizeD, osizeT, osizeH, osizeW}); + + totalZ = sizeD * osizeT; + } else { + int64_t sizeB = input.size(0); + sizeD = input.size(1); + isizeT = input.size(2); + isizeH = input.size(3); + isizeW = input.size(4); + + istrideD = input.stride(1); + istrideT = input.stride(2); + istrideH = input.stride(3); + istrideW = input.stride(4); + + output.resize_({sizeB, sizeD, osizeT, osizeH, osizeW}); + + totalZ = sizeB * sizeD * osizeT; + } + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_cuda", [&] { + scalar_t* input_data = input.data(); + scalar_t* output_data = output.data(); + + adaptiveaveragepool_loop( + input_data, output_data, + totalZ, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, istrideH, istrideW); + }); +} + +void adaptive_avg_pool3d_backward_out_cuda_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + TensorArg grad_input_arg{gradInput, "gradInput", 1}; + TensorArg grad_output_arg{gradOutput_, "gradOutput_", 2}; + TensorArg input_arg{input, "input", 3}; + + checkAllSameGPU( + "adaptive_avg_pool3d_out_cuda", + {grad_input_arg, grad_output_arg, input_arg}); + + const Tensor gradOutput = gradOutput_.contiguous(); + + gradInput.resize_as_(input); + gradInput.zero_(); + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t osizeT, osizeH, osizeW; + int64_t totalZ; + + if (input.ndimension() == 4) { + sizeD = input.size(0); + isizeT = input.size(1); + isizeH = input.size(2); + isizeW = input.size(3); + + osizeT = gradOutput.size(1); + osizeH = gradOutput.size(2); + osizeW = gradOutput.size(3); + } else { + sizeD = input.size(1); + isizeT = input.size(2); + isizeH = input.size(3); + isizeW = input.size(4); + + osizeT = gradOutput.size(2); + osizeH = gradOutput.size(3); + osizeW = gradOutput.size(4); + } + + bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); + + if (input.ndimension() == 4) { + totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; + } else { + int sizeB = input.size(0); + totalZ = atomic ? 
sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; + } + + if (atomic) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cuda", [&] { + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + + atomicadaptiveaveragegradinput_loop( + gradInput_data, gradOutput_data, + totalZ, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cuda", [&] { + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + + adaptiveaveragegradinput_loop( + gradInput_data, gradOutput_data, + totalZ, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + }); + } +} + +} // namespace + +Tensor& adaptive_avg_pool3d_out_cuda( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + adaptive_avg_pool3d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor adaptive_avg_pool3d_cuda( + const Tensor& input, + IntArrayRef output_size) { + auto output = at::empty({0}, input.options()); + adaptive_avg_pool3d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor& adaptive_avg_pool3d_backward_out_cuda( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input); + return gradInput; +} + +Tensor adaptive_avg_pool3d_backward_cuda( + const Tensor& gradOutput_, + const Tensor& input) { + auto gradInput = at::zeros_like(input); + adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input); + return gradInput; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu index b5b47dbf6e2d..5c9ec4f8ceb1 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu @@ -204,16 +204,16 @@ void adaptive_max_pool2d_out_cuda_template( checkAllSameGPU("adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg}); for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool2d_cuda(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 2, + TORCH_CHECK(output_size.size() == 2, "adaptive_max_pool2d: internal error: output_size.size() must be 2"); int64_t osizeH = output_size[0]; diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu index 9ee9d70280e8..21c57c0ce382 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu @@ -307,16 +307,16 @@ void adaptive_max_pool3d_out_cuda_template( checkAllSameGPU("adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg}); for (int64_t i = 0; i < input_.ndimension(); i++) { - AT_CHECK(input_.size(i) > 0, + TORCH_CHECK(input_.size(i) > 0, "adaptive_max_pool3d_cuda(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input_.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input_.ndimension() == 4 || 
input_.ndimension() == 5), + TORCH_CHECK((input_.ndimension() == 4 || input_.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 3, + TORCH_CHECK(output_size.size() == 3, "adaptive_max_pool3d: internal error: output_size.size() must be 3"); int64_t osizeT = output_size[0]; diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index d91cfdab329a..0411850316f2 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -63,6 +63,18 @@ void magmaLuNoPivBatched( AT_ERROR("lu only takes float or double Tensors"); } +template +inline magma_int_t magmaGetriOptimalBlocksize(magma_int_t n) { + AT_ERROR("getri only takes float or double Tensors"); +} + +template +void magmaGetri( + magma_int_t n, scalar_t* dA, magma_int_t ldda, magma_int_t* ipiv, scalar_t* dwork, + magma_int_t lwork, magma_int_t* info) { + AT_ERROR("getri only takes float or double Tensors"); +} + template void magmaGetriBatched( magma_int_t n, scalar_t** dA_array, magma_int_t ldda, @@ -202,6 +214,30 @@ void magmaLuNoPivBatched( magma_sgetrf_nopiv_batched(m, n, dA_array, ldda, info_array, batchsize, magma_queue.get_queue()); } +template<> +inline magma_int_t magmaGetriOptimalBlocksize(magma_int_t n) { + return magma_get_dgetri_nb(n); +} + +template<> +inline magma_int_t magmaGetriOptimalBlocksize(magma_int_t n) { + return magma_get_sgetri_nb(n); +} + +template<> +void magmaGetri( + magma_int_t n, double* dA, magma_int_t ldda, magma_int_t* ipiv, double* dwork, + magma_int_t lwork, magma_int_t* info) { + magma_dgetri_gpu(n, dA, ldda, ipiv, dwork, lwork, info); +} + +template<> +void magmaGetri( + magma_int_t n, float* dA, magma_int_t ldda, magma_int_t* ipiv, float* dwork, + magma_int_t lwork, magma_int_t* info) { + magma_sgetri_gpu(n, dA, ldda, ipiv, dwork, lwork, info); +} + template<> void magmaGetriBatched( magma_int_t n, double** dA_array, magma_int_t ldda, @@ -382,7 +418,7 @@ std::tuple _solve_helper_cuda(const Tensor& self, const Tensor& // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template -static void apply_inverse(Tensor& self, Tensor& self_inv, std::vector& infos) { +static void apply_batched_inverse(Tensor& self, Tensor& self_inv, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("inverse: MAGMA library not found in " "compilation. Please rebuild with MAGMA."); @@ -429,17 +465,47 @@ AT_ERROR("inverse: MAGMA library not found in " #endif } -// Because this is out-of-place inverse, the predefined macros will -// not work +template +static void apply_single_inverse(Tensor& self, int64_t& info) { +#ifndef USE_MAGMA +AT_ERROR("inverse: MAGMA library not found in " + "compilation. 
Please rebuild with MAGMA."); +#else + auto self_data = self.data(); + magma_int_t n = magma_int_cast(self.size(-2), "self.size(-2)"); + magma_int_t lwork = n * magmaGetriOptimalBlocksize(n); + magma_int_t info_tmp = 0; + + Tensor ipiv = at::empty({n}, at::kInt); + Tensor dwork = at::empty({lwork}, self.options()); + magmaLu(n, n, self_data, n, ipiv.data(), &info_tmp); + if (info_tmp != 0) { + info = info_tmp; + return; + } + magmaGetri( + n, self_data, n, ipiv.data(), dwork.data(), lwork, &info_tmp); + info = info_tmp; +#endif +} + Tensor _inverse_helper_cuda(const Tensor& self) { - std::vector infos(batchCount(self), 0); - auto self_working_copy = cloneBatchedColumnMajor(self); auto self_inv_working_copy = cloneBatchedColumnMajor(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{ - apply_inverse( - self_working_copy, self_inv_working_copy, infos); - }); - batchCheckErrors(infos, "inverse_cuda"); + if (self.dim() > 2) { + std::vector infos(batchCount(self), 0); + auto self_working_copy = cloneBatchedColumnMajor(self); + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{ + apply_batched_inverse( + self_working_copy, self_inv_working_copy, infos); + }); + batchCheckErrors(infos, "inverse_cuda"); + } else { + int64_t info = 0; + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{ + apply_single_inverse(self_inv_working_copy, info); + }); + singleCheckErrors(info, "inverse_cuda"); + } return self_inv_working_copy; } @@ -497,7 +563,7 @@ Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upp AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_solve_cuda", [&]{ apply_cholesky_solve(self_working_copy, A_working_copy, upper, info); }); - AT_CHECK(info == 0, "MAGMA cholesky_solve : invalid argument: ", -info); + TORCH_CHECK(info == 0, "MAGMA cholesky_solve : invalid argument: ", -info); return self_working_copy; } @@ -633,7 +699,7 @@ AT_ERROR("lu: MAGMA library not found in " } std::tuple _lu_with_info_cuda(const Tensor& self, bool pivot, bool check_errors) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "expected tensor with 2 or more dimensions, got size: ", self.sizes(), " instead"); squareCheckInputs(self); diff --git a/aten/src/ATen/native/cuda/BinaryOpsKernel.cu b/aten/src/ATen/native/cuda/BinaryOpsKernel.cu index 2b8e33837348..acfb268e7ada 100644 --- a/aten/src/ATen/native/cuda/BinaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryOpsKernel.cu @@ -7,8 +7,7 @@ #include -// NOTE: CUDA 8 does not allow __device__ lambdas (GPU_LAMBDA) to be defined -// inside other lambdas. CUDA on Windows requires that the enclosing function +// NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. 
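A minimal, self-contained CUDA sketch of the restriction the note above describes: the function that defines an extended __device__ lambda must have external linkage (not static, not in an anonymous namespace) so that nvcc on Windows can instantiate the kernel from it. Everything below is illustrative only, assumes compilation with nvcc's extended-lambda support, and is not part of this diff:

  #include <cuda_runtime.h>

  template <typename func_t>
  __global__ void apply_elementwise(float* out, const float* in, int n, func_t f) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
      out[i] = f(in[i]);
    }
  }

  // External linkage: nvcc can instantiate apply_elementwise from the lambda
  // defined here. Marking this function static, or moving it into an anonymous
  // namespace, would give it internal linkage and trigger the Windows build
  // problem the note above warns about. (Illustrative sketch, not patch code.)
  void scale_by_two(float* out, const float* in, int n, cudaStream_t stream) {
    auto f = [] __device__ (float x) { return 2.0f * x; };
    apply_elementwise<<<(n + 255) / 256, 256, 0, stream>>>(out, in, n, f);
  }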
namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 9ddab337401e..318454dbf09b 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -1,75 +1,40 @@ #include #include #include -#include -#include #include #include #include #include +#include +#include +#include -namespace { +namespace at { +namespace native { -using namespace at; using namespace at::cuda; -// Copy operator for the pointwise apply kernel -template -struct CopyOp { - static void apply(Tensor& dst, const Tensor& src) { - CUDA_tensor_apply2( - dst, src, [] __device__(dst_T & dst_val, const src_T& src_val) { -#if __CUDA_ARCH__ >= 350 - dst_val = static_cast( - static_cast>(__ldg(&src_val))); -#else - dst_val = static_cast(static_cast>(src_val)); -#endif - }); - } -}; - -template -struct CopyOp { - static void apply(Tensor& dst, const Tensor& src) { - CUDA_tensor_apply2( - dst, src, [] __device__(dst_T & dst_val, const bool& src_val) { - dst_val = static_cast(static_cast>(src_val)); - }); - } -}; +template +void copy_kernel_impl(TensorIterator& iter) { + gpu_unary_kernel(iter, []GPU_LAMBDA(src_t x) -> dst_t { + return static_cast(static_cast>(x)); + }); +} // device-to-device copy, does type conversion -template -void copy_device_to_device(Tensor& dst, const Tensor& src) { - auto numel = dst.numel(); - if (dst.is_same(src) || numel == 0) { - return; - } +static void copy_device_to_device(TensorIterator& iter, bool non_blocking) { + int64_t numel = iter.numel(); - // We can memcpy the memory if: - // -both tensors are contiguous; or, - // -there is only one element to copy; or, - // -FIXME: if both tensors have matching size and stride arrays, and no - // holes within (in other words, there is some permutation that can be applied - // to the size/strides such that the resulting tensor is - // contiguous). - // -AND: both tensors have the same type. - bool same_type = std::is_same::value; - bool memcpy_eligible = - ((src.is_contiguous() && dst.is_contiguous()) || (numel == 1)) && - same_type; + // We can memcpy the memory if both tensors have the same type AND both + // tensors are contiguous after dimension coalescing and reordering. + bool same_type = iter.dtype(0) == iter.dtype(1); + bool memcpy_eligible = same_type && iter.is_contiguous(); - Device src_device = src.device(); - Device dst_device = dst.device(); + Device dst_device = iter.device(0); + Device src_device = iter.device(1); CUDAGuard device_guard(src_device); - // Try to enable p2p access. This also handles the case src_device == - // dst_device. - bool p2pEnabled = THCState_getPeerToPeerAccess( - globalContext().getTHCState(), src_device.index(), dst_device.index()); - // We always perform the copy on the source device, using the current stream // on the source device, and we fully synchronize on both src and dst's // current streams for completion of the copy. We have to explicitly do this @@ -94,61 +59,18 @@ void copy_device_to_device(Tensor& dst, const Tensor& src) { if (memcpy_eligible) { // Perform the copy AT_CUDA_CHECK(cudaMemcpyAsync( - dst.data(), - src.data(), - numel * sizeof(dst_T), + iter.data_ptr(0), + iter.data_ptr(1), + numel * iter.element_size(0), cudaMemcpyDeviceToDevice, copy_stream)); } else { - // Non-contiguous copy or a type-conversion copy - - // We avoid creating temporary memory copies if possible. 
- // If both src and dst are on the same device, or if they are on - // different devices and p2p access is enabled, perform the copy - // by a pointwise copy kernel. - // Otherwise, we'll have to make contiguous (which will in fact - // invoke copy() again), and then perform the copy. - // FIXME: might want to consider only running the pointwise kernel - // if both src and dst innermost dimensions are contiguous. If - // they are not, then taking the hit of the memory allocation/free - // might be worth it to avoid non-coalesced reads or writes. - if (p2pEnabled) { - CopyOp::apply(dst, src); - } else { - // GPUs can't access each other directly, but the tensors - // involved are non-contiguous and/or are different types. - - // Make sure the src is contiguous and in the same type as dst - Tensor src_contig; - if (same_type) { - src_contig = src.contiguous(); - } else { - // Types are different - // Copy into the new format, contiguous, on the source device - src_contig = at::empty_like(dst, src.options().dtype(dst.dtype())); - - CopyOp::apply(src_contig, src); - } - - // Make sure the dst is contiguous - device_guard.set_device(dst_device); - Tensor dst_contig = dst.contiguous(); - - // Now, we are ready for a cross-device memcpy of contiguous - // data, of the same layout and type - device_guard.set_device(src_device); - - AT_CUDA_CHECK(cudaMemcpyAsync( - dst_contig.data(), - src_contig.data(), - numel * sizeof(dst_T), - cudaMemcpyDeviceToDevice, - copy_stream)); - - if (!dst.is_contiguous()) { - copy_device_to_device(dst, dst_contig); - } - } + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] { + using dst_t = scalar_t; + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(1), "copy_", [&] { + copy_kernel_impl(iter); + }); + }); } if (src_device != dst_device) { @@ -166,143 +88,103 @@ void copy_device_to_device(Tensor& dst, const Tensor& src) { AT_CUDA_CHECK(cudaGetLastError()); } -void copy_from_cpu(Tensor& dst, const Tensor& src) { - Tensor dst_contig = dst.contiguous(); - Tensor src_contig = src.contiguous(); +static bool copy_requires_temporaries(TensorIterator& iter) { + Device dst_device = iter.device(0); + Device src_device = iter.device(1); - CUDAStream stream = getCurrentCUDAStream(); + if (dst_device == src_device) { + // We never require temporaries for copies on the same GPU. + TORCH_INTERNAL_ASSERT(dst_device.is_cuda() && src_device.is_cuda()); + return false; + } - AT_CUDA_CHECK(cudaMemcpyAsync( - dst_contig.data_ptr(), - src_contig.data_ptr(), - src.numel() * src.element_size(), - cudaMemcpyHostToDevice, - stream)); - AT_CUDA_CHECK(cudaStreamSynchronize(stream)); - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "copy_from_cpu", [&]() { - copy_device_to_device(dst, dst_contig); - }); + bool same_dtype = iter.dtype(0) == iter.dtype(1); + if (same_dtype && iter.is_contiguous()) { + // Contiguous same-dtype copies can always use cudaMemcpyAsync + return false; + } else if (dst_device.is_cuda() && src_device.is_cuda()) { + // Copies between GPUs can use the copy kernel if P2P is supported + return !THCState_getPeerToPeerAccess( + globalContext().getTHCState(), src_device.index(), dst_device.index()); + } else { + // The remaining cases require temporaries. For example, this includes + // non-contiguous copies between CPU and GPU. 
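A concrete case that lands in this "requires temporaries" branch (hypothetical usage; the shapes and tensor names are made up for illustration): a transposed CPU float tensor copied into a CUDA half tensor is neither same-dtype nor contiguous, so copy_kernel_cuda below first materializes a contiguous, dtype-converted temporary and then issues a single cudaMemcpyAsync.

  #include <torch/torch.h>  // assumes the public libtorch API is available

  int main() {
    auto src = torch::randn({128, 64}).t();  // non-contiguous CPU float
    auto dst = torch::empty({64, 128},
                            torch::dtype(torch::kHalf).device(torch::kCUDA));
    dst.copy_(src);  // different dtype + non-contiguous source: temporaries path
    return 0;
  }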
+ return true; + } } -void copy_to_cpu(Tensor& dst, const Tensor& src) { - Tensor dst_contig = dst.contiguous(); - Tensor src_contig = src.contiguous(); - - CUDAGuard device_guard(src.device()); - CUDAStream stream = getCurrentCUDAStream(); - - AT_CUDA_CHECK(cudaMemcpyAsync( - dst_contig.data_ptr(), - src_contig.data_ptr(), - src.numel() * src.element_size(), - cudaMemcpyDeviceToHost, - stream)); - AT_CUDA_CHECK(cudaStreamSynchronize(stream)); - _copy_same_type_(dst, dst_contig); -} +static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { + AT_ASSERT(iter.ntensors() == 2); + + if (copy_requires_temporaries(iter)) { + // NB: this involves recursive calls to copy. Be careful that those copies + // don't require temporaries or you will cause an infinite recursion! + auto& dst = iter.tensor(0); + Tensor dst_contig; + Tensor src_contig; + + // Type conversions are performed on the CPU for CPU-GPU copies and on + // the src device for GPU-GPU copies. + if (iter.device_type(0) == kCUDA) { + dst_contig = dst.is_contiguous() ? dst : at::empty_like(dst); + src_contig = iter.tensor(1).to(iter.dtype(0)).expand_as(dst).contiguous(); + } else { + bool same_type = iter.dtype(0) == iter.dtype(1); + dst_contig = (dst.is_contiguous() && same_type) ? dst : at::empty_like(dst, iter.dtype(1)); + src_contig = iter.tensor(1).expand_as(dst).contiguous(); + } -void copy_from_cpu_async_(Tensor& dst, const Tensor& src) { - AT_CHECK(dst.is_contiguous(), "Target tensor must be contiguous."); - AT_CHECK(src.is_contiguous(), "Source tensor must be contiguous."); + // perform a same-dtype copy on contiguous tensors + TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); + TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); + dst_contig.copy_(src_contig, non_blocking); - if (dst.numel() == 0) { + // if necessary, copy back into dst + if (!dst_contig.is_same(dst)) { + TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); + dst.copy_(dst_contig, non_blocking); + } return; } - CUDAGuard device_guard(dst.device()); - CUDAStream stream = getCurrentCUDAStream(); - - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "copy_from_cpu_async", [&]() { - AT_CUDA_CHECK(cudaMemcpyAsync( - dst.data(), - src.data(), - src.numel() * sizeof(scalar_t), - cudaMemcpyHostToDevice, - stream)); - AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent( - src.storage().data(), stream)); - }); -} + Device dst_device = iter.device(0); + Device src_device = iter.device(1); -void copy_to_cpu_async_(Tensor& dst, const Tensor& src) { - AT_CHECK(dst.is_contiguous(), "Target tensor must be contiguous."); - AT_CHECK(src.is_contiguous(), "Source tensor must be contiguous."); - - if (dst.numel() == 0) { + // Copy on GPU (or between GPUs) + if (dst_device.is_cuda() && src_device.is_cuda()) { + copy_device_to_device(iter, non_blocking); return; } - CUDAGuard device_guard(src.device()); - CUDAStream stream = getCurrentCUDAStream(); - - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "copy_to_cpu_async", [&]() { - AT_CUDA_CHECK(cudaMemcpyAsync( - dst.data(), - src.data(), - src.numel() * sizeof(scalar_t), - cudaMemcpyDeviceToHost, - stream)); - AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent( - src.storage().data(), stream)); - }); -} - -template -void _copy__cuda(Tensor& dst, const Tensor& src, bool non_blocking) { - AT_CHECK(dst.numel() == src.numel(), "sizes do not match"); - 
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "_copy__cuda", [&]() { - if (dst.is_cuda() && src.is_cuda()) { - copy_device_to_device(dst, src); - } else if (dst.is_cuda()) { - if (std::is_same::value) { - if (non_blocking) { - copy_from_cpu_async_(dst, src); - } else { - copy_from_cpu(dst, src); - } - } else { - // Do a dtype converting copy on the CPU, then copy to device - Tensor srcf = at::empty_like(src, src.options().dtype(dst.dtype())); - s_copy_(srcf, src); - copy_from_cpu(dst, srcf); - } - } else { - if (std::is_same::value) { - if (non_blocking) { - copy_to_cpu_async_(dst, src); - } else { - copy_to_cpu(dst, src); - } - } else { - // Copy to CPU as the same dtype, then do a dtype converting copy - Tensor srcf = at::empty_like(src, dst.options().dtype(src.dtype())); - copy_to_cpu(srcf, src); - s_copy_(dst, srcf); - } - } - }); -} + // Copy between CPU and GPU + cuda::OptionalCUDAGuard device_guard; + cudaMemcpyKind kind; + if (dst_device.is_cuda() && src_device.is_cpu()) { + device_guard.set_device(dst_device); + kind = cudaMemcpyHostToDevice; + } else if (dst_device.is_cpu() && src_device.is_cuda()) { + device_guard.set_device(src_device); + kind = cudaMemcpyDeviceToHost; + } else { + TORCH_INTERNAL_ASSERT(false, "unsupported devices in GPU copy_()"); + } -} // namespace + void* dst = iter.data_ptr(0); + void* src = iter.data_ptr(1); + int64_t nbytes = iter.numel() * iter.element_size(0); + CUDAStream stream = getCurrentCUDAStream(); -namespace at { -namespace native { + AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream)); -Tensor& _s_copy__cuda(Tensor& self, const Tensor& src, bool non_blocking) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "_copy__cuda", [&]() { - ::_copy__cuda(self, src, non_blocking); - }); - return self; + if (non_blocking) { + void* ptr = (dst_device == kCPU ? dst : src); + AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent(ptr, stream)); + } else { + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + } } -Tensor _s_copy_from_cuda( - const Tensor& self, - const Tensor& dst, - bool non_blocking) { - Tensor dst_ = dst; - _s_copy__cuda(dst_, self); - return dst; -} +REGISTER_DISPATCH(copy_stub, ©_kernel_cuda); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index ea894edc5953..d239551f5475 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -109,13 +109,13 @@ class CuFFTConfig { if (input.scalar_type() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 auto dev_prop = at::cuda::getCurrentDeviceProperties(); - AT_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), "cuFFT doesn't support signals of half type with compute " "capability less than SM_53, but the device containing input half " "tensor only has SM_", dev_prop->major, dev_prop->minor); for (int64_t i = 0; i < signal_ndim; i++) { auto signal_size = checked_signal_sizes[i]; - AT_CHECK(is_pow_of_two(signal_size), + TORCH_CHECK(is_pow_of_two(signal_size), "cuFFT doesn't support signals of half type with size at any ", "dimension that is not a power of two, but got a signal size of ", checked_signal_sizes); @@ -451,9 +451,9 @@ class CuFFTParamsLRUCache { // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. 
Since // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check // first. - AT_CHECK(new_size >= 0, + TORCH_CHECK(new_size >= 0, "cuFFT plan cache size must be non-negative, but got ", new_size); - AT_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); _max_size = static_cast(new_size); } diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu new file mode 100644 index 000000000000..8ef0ed8dbf04 --- /dev/null +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -0,0 +1,420 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +namespace { + +__device__ inline int min(int a, int b) { + return a <= b ? a : b; +} + +// kernels borrowed from Caffe +template +__global__ void MaxPoolForward(const int nthreads, const scalar_t* bottom_data, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, scalar_t* top_data, + int64_t* top_mask) { + CUDA_KERNEL_LOOP(index, nthreads) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + while(hstart < 0) + hstart += dilation_h; + while(wstart < 0) + wstart += dilation_w; + accscalar_t maxval = THCNumerics::min(); + int maxidx = -1; + bottom_data += (n * channels + c) * height * width; + for (int h = hstart; h < hend; h += dilation_h) { + for (int w = wstart; w < wend; w += dilation_w) { + scalar_t val = bottom_data[h * width + w]; + if ((ScalarConvert::to(val) > maxval) || THCNumerics::isnan(val)) { + maxidx = h * width + w; + maxval = ScalarConvert::to(val); + } + } + } + top_data[index] = ScalarConvert::to(maxval); + top_mask[index] = maxidx; + } +} + +static const int BACKWARD_THREADS = 256; + +template +#if defined (__HIP_PLATFORM_HCC__) +C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 4) +#else +C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 8) +#endif +__global__ void MaxPoolBackward(const int nthreads, const scalar_t* top_diff, + const int64_t* top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + scalar_t* bottom_diff) { + CUDA_KERNEL_LOOP(index, height*width) { + int h = index/width; + int w = index - h * width; +//get some templating performance benefits without actually templating + int phstart, phend, pwstart, pwend; + if (stride_h == 1) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1; + phend = min((h + pad_h) + 1, pooled_height); + } else if (stride_h == 2) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 
0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1; + phend = min((h + pad_h) / 2 + 1, pooled_height); + } else { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; + phend = min((h + pad_h) / stride_h + 1, pooled_height); + } + if (stride_w == 1) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1; + pwend = min((w + pad_w) + 1, pooled_width); + } else if (stride_w == 2) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1; + pwend = min((w + pad_w) / 2 + 1, pooled_width); + } else { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; + pwend = min((w + pad_w) / stride_w + 1, pooled_width); + } + for (int n = blockIdx.y; n < num; n += gridDim.y) + for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + + accscalar_t gradient = accscalar_t(0); + int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + top_mask += offset; +//get some templating performance benefits without actually templating + if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); + } + } + } + } else { + if (top_mask[phstart * pooled_width + pwstart] == h * width + w) { + gradient += ScalarConvert::to(top_diff[phstart * pooled_width + pwstart]); + } + } + bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert::to(gradient); + } + } +} + +void max_pool2d_with_indices_out_cuda_template( + Tensor& output, + Tensor& indices, + const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + TensorArg output_arg{ output, "output", 1 }; + TensorArg indices_arg{ indices, "indices", 2 }; + TensorArg input_arg{ input_, "input_", 3 }; + + checkAllSameGPU("max_pool2d_with_indices_out_cuda", + {output_arg, indices_arg, input_arg}); + + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const int64_t nbatch = input_.ndimension() == 4 ? 
input_.size(-4) : 1; + const int64_t nInputPlane = input_.size(-3); + const int64_t inputHeight = input_.size(-2); + const int64_t inputWidth = input_.size(-1); + + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + + max_pool2d_with_indices_shape_check( + input_, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth); + + Tensor input = input_.contiguous(); + + output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + + const int count = safe_downcast(output.numel()); + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BACKWARD_THREADS); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), + "max_pool2d_with_indices_out_cuda_frame", + [&] { + using accscalar_t = acc_type; + + scalar_t *output_data = output.data(); + scalar_t *input_data = input.data(); + int64_t *indices_data = indices.data(); + + MaxPoolForward + <<>>( + count, input_data, + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); } + ); + + TORCH_CHECK(cudaGetLastError() == cudaSuccess, + "max_pool2d_with_indices_out_cuda_frame failed with error code ", + cudaGetLastError()); + + if(input.ndimension() == 3) { + output.resize_({nInputPlane, outputHeight, outputWidth}); + } +} + +void max_pool2d_with_indices_backward_out_cuda_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input_, + const Tensor& indices, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + TensorArg gradInput_arg{ gradInput, "gradInput", 1 }; + TensorArg gradOutput_arg{ gradOutput_, "gradOutput_", 2 }; + TensorArg input_arg{ input_, "input_", 3 }; + TensorArg indices_arg{ indices, "indices", 4 }; + + checkAllSameGPU("max_pool2d_with_indices_out_cuda", + {gradInput_arg, gradOutput_arg, input_arg, indices_arg}); + + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const Tensor input = input_.contiguous(); + + const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + max_pool2d_with_indices_shape_check( + input_, + gradOutput_, + indices, + nbatch, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, + /*cuda=*/ true); + + const Tensor gradOutput = gradOutput_.contiguous(); + gradInput.resize_as_(input); + + int64_t count = input.numel(); + dim3 grid; + int imgcount = inputWidth * inputHeight; + const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS; + grid.x = blocks; + grid.y = nbatch; + grid.z = nInputPlane; + uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + if (maxGridY < grid.y) grid.y = maxGridY; + if (maxGridZ < grid.z) grid.z = maxGridZ; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), + "max_pool2d_with_indices_out_cuda_frame", + [&] { + using accscalar_t = acc_type; + + scalar_t *gradOutput_data = gradOutput.data(); + scalar_t *gradInput_data = gradInput.data(); + int64_t *indices_data = indices.data(); + + MaxPoolBackward + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } + ); + + TORCH_CHECK(cudaGetLastError() == cudaSuccess, + "fractional_max_pool2d_backward_out_cuda failed with error code ", + cudaGetLastError()); +} + +} // namespace + +std::tuple max_pool2d_with_indices_out_cuda( + Tensor& output, + Tensor& indices, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + max_pool2d_with_indices_out_cuda_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +std::tuple max_pool2d_with_indices_cuda( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + Tensor output = at::empty({0}, input.options()); + Tensor indices = at::empty({0}, input.options().dtype(kLong)); + max_pool2d_with_indices_out_cuda_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +Tensor& max_pool2d_with_indices_backward_out_cuda( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + max_pool2d_with_indices_backward_out_cuda_template( + gradInput, + gradOutput_, + input, + indices, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +Tensor max_pool2d_with_indices_backward_cuda( + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + auto gradInput = at::zeros_like(input); + max_pool2d_with_indices_backward_out_cuda_template( + gradInput, + gradOutput_, + input, + indices, + 
kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +} // at::native +} // at diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 079a26f98022..0aec588b5476 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include @@ -23,17 +25,160 @@ #include #include +/** + * Note [Register spilling in curand call for CUDA < 10] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * For CUDA < 10, curandStatePhilox4_32_10_t engine achieves poor performance (60% SOL bandwidth) + * when called to generate one random number at a time. This is because the line + * unsigned ret = (&state->output.x)[state->STATE++]; + * in + * QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state) + * in curand_kernel.h dynamically indexes into state.output, preventing the compiler from ever + * storing state.output in registers. + * + * CUDA 10 fixed this problem. However, for backwards compatibility, in the following kernels + * we are using curand distributions that utilize curand4 call. curand4 call doesn't have the + * register spilling problem. + */ + THCGenerator* THCRandom_getGenerator(THCState* state); namespace { -// increment should be at least the number of curand() random numbers used in -// each thread. +// Increment should be at least the number of curand() random numbers used in +// each thread. It is the user's responsibility to make sure that the increment for philox is never +// smaller than the number of curand() calls. Increment value > the number of curand() calls +// won't harm but anything less would mean that you would be reusing random values from +// previous calls. +// e.g. In many kernels below, we use distributions that utilize curand4 call in the kernel. +// Hence, increment value should be at least 4 for those kernels. std::pair next_philox_seed(at::Generator* gen, uint64_t increment) { auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment); return std::make_pair(gen_->state.initial_seed, offset); } +// launch bounds used for kernels utilizing TensorIterator +const uint32_t block_size_bound = 256; +const uint32_t grid_size_bound = 4; +// number of randoms given by distributions like curand_uniform4, curand_uniform2_double +// used in calculating philox offset. +const uint32_t curand4_engine_calls = 4; + +// utility function that calculates proper philox_offset +// for distributions utilizing TensorIterator. For distributions using +// TensorIterator, we are using a grid-stride loop with each +// thread yielding one element per thread. For the edge of the grid-stride +// loop, if the tensor size is large, the unroll loop will kick in and the float4 +// from curand4 will start getting utilized (for common tensor sizes, we end up +// using rand.x from each thread). Hence, the philox_offset is +// (number of elements per thread * number of engine calls), which makes +// sure that philox offset increment is not less than the number of randoms used +// in each thread. 
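A worked example of the offset arithmetic implemented just below, using assumed device numbers (20 SMs, maxThreadsPerMultiProcessor = 2048) and numel = 1 << 20; none of these figures come from the patch itself:

  blocks_per_sm                 = 2048 / 256 = 8
  grid.x                        = min(20 * 8, (1048576 + 255) / 256) = 160
  elements per grid-stride pass = 256 * 160 * 4 (unroll) = 163840
  passes per thread             = (1048576 - 1) / 163840 + 1 = 7
  counter_offset                = 7 * curand4_engine_calls = 28

Each pass makes one curand4 call that consumes 4 philox counter values, so reserving 28 values per thread guarantees that the next kernel drawing from the same generator never reuses randoms.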
+std::tuple calc_execution_policy(int64_t total_elements) { + const uint64_t numel = static_cast(total_elements); + const uint32_t block_size = block_size_bound; + const uint32_t unroll = curand4_engine_calls; + dim3 dim_block(block_size); + dim3 grid((numel + block_size - 1) / block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + grid.x = std::min( + static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm, + grid.x); + //number of times random will be generated per thread, to offset philox counter in thc random state + uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll) + 1) + * curand4_engine_calls; + return std::make_tuple(counter_offset, grid, dim_block); +} + +// grid stride loop kernel for distributions +template +C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) +__global__ void distribution_elementwise_grid_stride_kernel(int numel, + std::pair seeds, + const dist_t dist_func, + const transform_t transform_func) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + idx, + seeds.second, + &state); + int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * + blockDim.x * gridDim.x * unroll_factor; + for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { + auto rand = dist_func(&state); + #pragma unroll + for (int ii = 0; ii < unroll_factor; ii++) { + int li = linear_index + blockDim.x * gridDim.x * ii; + if (li < numel) { + transform_func(li, static_cast((&rand.x)[ii])); + } + } + __syncthreads(); + } +} + +template +void distribution_nullary_kernel(at::TensorIterator& iter, + at::Generator* gen, + const dist_t& dist_func, + const transform_t transform_func) { + static_assert(unroll_factor >= 1, "unroll_factor must be >= 1."); + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + auto execution_policy = calc_execution_policy(numel); + auto counter_offset = std::get<0>(execution_policy); + auto grid = std::get<1>(execution_policy); + auto block = std::get<2>(execution_policy); + auto seeds = next_philox_seed(gen, counter_offset); + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_nullary_kernel(sub_iter, + gen, dist_func, transform_func); + } + return; + } + + char* out_data = (char*)iter.data_ptr(0); + + auto stream = at::cuda::getCurrentCUDAStream(); + if (iter.is_trivial_1d()) { + auto strides = iter.get_inner_strides(); + int stride0 = strides[0]; + distribution_elementwise_grid_stride_kernel<<>>( + numel, + seeds, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + scalar_t* out = (scalar_t*)&out_data[stride0 * idx]; + *out = transform_func(rand); + } + ); + } else { + auto offset_calc = at::native::make_offset_calculator<1>(iter); + distribution_elementwise_grid_stride_kernel<<>>( + numel, + seeds, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + auto offsets = offset_calc.get(idx); + scalar_t* out = (scalar_t*)&out_data[offsets[0]]; + *out = transform_func(rand); + } + ); + } + AT_CUDA_CHECK(cudaGetLastError()); +} + template void poisson_cuda_kernel( at::Tensor& ret, @@ -117,6 +262,7 @@ void bernoulli_tensor_cuda_kernel( blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); + // See Note [Register spilling in curand call for CUDA < 10] float4 rand = curand_uniform4(&state); switch (n) { case 4: { @@ -159,6 
+305,7 @@ void bernoulli_scalar_cuda_kernel( blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); + // See Note [Register spilling in curand call for CUDA < 10] float4 rand = curand_uniform4(&state); switch (n) { case 4: { @@ -248,7 +395,7 @@ Tensor& bernoulli_tensor_cuda_(Tensor &self, const Tensor& p_, Generator* gen) { } Tensor& bernoulli_scalar_cuda_(Tensor &self, double p, Generator* gen) { - AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "bernoulli_scalar_cuda_", [&] { auto seeds = next_philox_seed(gen, 10); bernoulli_scalar_cuda_kernel(self, p, seeds); @@ -256,5 +403,49 @@ Tensor& bernoulli_scalar_cuda_(Tensor &self, double p, Generator* gen) { return self; } +void uniform_kernel_cuda(TensorIterator& iter, double from_, double to_, Generator* gen) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "uniform_cuda", [&] { + auto from = static_cast(from_); + auto to = static_cast(to_); + TORCH_CHECK(from <= to, + "uniform_ expects to return a [from, to) range, but found from=", from, + " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(iter.dtype()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + + using accscalar_t = at::acc_type; + auto range = static_cast(to-from); + from = static_cast(from); + // define lambda to reverse bounds, multiply 'range' and add 'from_' + auto uniform_func = [range, from] __device__ (accscalar_t rand) { + // reverse the bounds of curand4 from (0, 1] to [0, 1) + // Note that this method is from legacy THCTensorRandom and is likely to give + // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and + // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. + auto reverse_bound_rand = rand == static_cast(1.0) ? 
static_cast(0.0) : rand; + return static_cast(reverse_bound_rand * range + from); + }; + if (std::is_same::value) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform2_double(state); }, + uniform_func); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform4(state); }, + uniform_func); + } + }); +} + +Tensor& uniform_cuda_(Tensor& self, double from, double to, Generator* gen) { + auto iter = TensorIterator::nullary_op(self); + uniform_kernel_cuda(*iter, from, to, gen); + return self; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index d858a0ece80b..e617af6611e5 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -102,6 +102,8 @@ fused_dropout_cuda(const Tensor& self, double p, Generator * gen){ Tensor ret = at::empty_like(self); Tensor mask = at::empty(self.sizes(), self.options().dtype(kByte)); const int64_t nelem = self.numel(); +//empty tensors should not get here, but just in case, avoid FPE + if (nelem==0) return std::tuple(self, mask); const int64_t block_size = 256; unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size; dim3 dim_block(block_size); @@ -152,7 +154,7 @@ fused_dropout_cuda(const Tensor& self, double p, Generator * gen){ Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){ Tensor ret = at::empty_like(self); - AT_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype"); + TORCH_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype"); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "masked_scale", [&] { using accscalar_t = acc_type; accscalar_t pa = (accscalar_t)(scale); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index ca9132ad90e7..f0761bd9b213 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -472,7 +472,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( const Tensor& offsets, const Tensor& offset2bag, int64_t mode) { - AT_CHECK( + TORCH_CHECK( mode == MODE_SUM, "embedding_bag_backward: per_sample_weights only supported for mode='sum'"); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index 7a0951419c7f..219eaa656667 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -135,11 +135,11 @@ void fractional_max_pool2d_out_cuda_template( int numBatch = 1; int ndims = input.ndimension(); - AT_CHECK(input.numel() > 0, + TORCH_CHECK(input.numel() > 0, "fractional_max_pool2d(): expected input to have non-empty ", "spatial dimensions."); - AT_CHECK((ndims == 3 || ndims == 4), + TORCH_CHECK((ndims == 3 || ndims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); if (ndims == 4) { @@ -159,10 +159,10 @@ void fractional_max_pool2d_out_cuda_template( int poolSizeH = pool_size[0]; int poolSizeW = pool_size[1]; - AT_CHECK(outputH + poolSizeH - 1 <= inputH, + TORCH_CHECK(outputH + poolSizeH - 1 <= inputH, "fractional_max_pool2d(): pool_size height ", poolSizeH, " too large relative to input height ", inputH); - AT_CHECK(outputW + poolSizeW - 1 <= inputW, + TORCH_CHECK(outputW + poolSizeW - 1 <= inputW, "pool_size width ", 
poolSizeW, " too large relative to input width ", inputW); @@ -208,7 +208,7 @@ void fractional_max_pool2d_out_cuda_template( poolSizeH, poolSizeW); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_out_cuda_frame failed with error code ", cudaGetLastError()); } @@ -237,9 +237,9 @@ void fractional_max_pool2d_backward_out_cuda_template( int outputH = output_size[0]; int outputW = output_size[1]; - AT_CHECK(outputH == gradOutput.size(dimh), + TORCH_CHECK(outputH == gradOutput.size(dimh), "fractional_max_pool2d(): gradOutput height unexpected"); - AT_CHECK(outputW == gradOutput.size(dimw), + TORCH_CHECK(outputW == gradOutput.size(dimw), "fractional_max_pool2d(): gradOutput width unexpected"); /* resize */ @@ -277,7 +277,7 @@ void fractional_max_pool2d_backward_out_cuda_template( devGradInput, devGradOutput, devIndices); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_backward_out_cuda_frame failed with error code ", cudaGetLastError()); } diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index 95f9b1a73bfc..c44b49c004d4 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -163,7 +163,7 @@ void fractional_max_pool3d_out_cuda_template( int64_t poolSizeW = pool_size[2]; int64_t ndims = input.ndimension(); - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && (ndims == 4 || ndims == 5), "fractional_max_pool3d_out_cuda_template(): ", "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", @@ -183,17 +183,17 @@ void fractional_max_pool3d_out_cuda_template( int64_t inputH = input.size(dimh); int64_t inputW = input.size(dimw); - AT_CHECK( + TORCH_CHECK( outputT + poolSizeT - 1 < inputT, "fractional_max_pool3d_out_cuda_template(): ", "pool time (", poolSizeT, ") too large relative to input time (", inputT, ")"); - AT_CHECK( + TORCH_CHECK( outputH + poolSizeH - 1 < inputH, "fractional_max_pool3d_out_cuda_template(): ", "pool height (", poolSizeH, ") too large relative to input height (", inputH, ")"); - AT_CHECK( + TORCH_CHECK( outputW + poolSizeW - 1 < inputW, "fractional_max_pool3d_out_cuda_template(): ", "pool width (", poolSizeW, ") too large relative to input width (", @@ -244,7 +244,7 @@ void fractional_max_pool3d_out_cuda_template( ); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_out_cuda_template failed with error code ", cudaGetLastError()); } @@ -276,17 +276,17 @@ void fractional_max_pool3d_backward_out_cuda_template( int64_t inputH = input.size(dimh); int64_t inputW = input.size(dimw); - AT_CHECK( + TORCH_CHECK( outputT == gradOutput.size(dimt), "fractional_max_pool3d_backward_out_cuda_template(): ", "gradOutput time unexpected" ); - AT_CHECK( + TORCH_CHECK( outputH == gradOutput.size(dimh), "fractional_max_pool3d_backward_out_cuda_template(): ", "gradOutput height unexpected" ); - AT_CHECK( + TORCH_CHECK( outputW == gradOutput.size(dimw), "fractional_max_pool3d_backward_out_cuda_template(): ", "gradOutput width unexpected" @@ -332,7 +332,7 @@ void fractional_max_pool3d_backward_out_cuda_template( ); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_out_cuda_template failed with error code ", cudaGetLastError()); } diff --git a/aten/src/ATen/native/cuda/Lerp.cu 
b/aten/src/ATen/native/cuda/Lerp.cu index d7a660e21c9a..2861485c8563 100644 --- a/aten/src/ATen/native/cuda/Lerp.cu +++ b/aten/src/ATen/native/cuda/Lerp.cu @@ -38,7 +38,7 @@ namespace native { Tensor& lerp_cuda_tensor_out(Tensor& result, const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out_cuda"); result.resize_as_(b_self); @@ -62,10 +62,10 @@ Tensor& lerp_cuda_scalar_out(Tensor& result, const Tensor& self, Tensor& lerp_cuda_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp__cuda"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cuda", [&]{ lerp_cuda(self, b_self, b_end, b_weight); @@ -76,7 +76,7 @@ Tensor& lerp_cuda_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) Tensor& lerp_cuda_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor b_self, b_end; std::tie(b_self, b_end) = expand_outplace(self, end, "lerp__cuda"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cuda", [&]{ @@ -87,7 +87,7 @@ Tensor& lerp_cuda_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor lerp_cuda_tensor(const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_cuda"); Tensor result = at::empty_like(b_self); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 8521a344a77b..141771e3c363 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -179,9 +179,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); - AT_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); - AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + TORCH_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); + TORCH_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + TORCH_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); int64_t lp_input_stride = log_probs.stride(0); int64_t lp_char_stride = log_probs.stride(2); @@ -211,13 +211,13 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, 
const } tg_target_stride = targets.stride(1); checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, + TORCH_CHECK(targets.size(1) >= max_target_length, "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, " (while checking arguments for ", c, ")"); } int64_t max_input_length = log_probs.size(0); for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, + TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, " (while checking arguments for ", c, ")"); } diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 8f92acd4393d..35e4684ee480 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -86,7 +86,7 @@ std::tuple batch_norm_update_stats_cuda( return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_backward", [&] { auto mean_st = running_mean.dtype(); auto var_st = running_var.dtype(); - AT_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); + TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); // Some workloads depend on passing in half input and float stats, which is // usually handled by cuDNN. However, the JIT sometimes replaces cuDNN calls with this // one so it needs to support the same case, or people start to complain. diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 3286e1bd0629..d3ad7792a07f 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -38,7 +38,7 @@ struct LogspaceOp { }; Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -68,7 +68,7 @@ Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t step } Tensor& logspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t steps, double base) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -105,11 +105,11 @@ Tensor& range_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { auto xend = end.to(); auto xstep = step.to(); - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); int64_t size = static_cast(((xend - xstart) / xstep) + 1); if (result.numel() != size) { @@ -152,14 +152,14 @@ Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { / step.to()); } - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - 
AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); - AT_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), "invalid size, possible overflow?"); int64_t size = static_cast(size_d); diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 574bf86eb361..d18758497131 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -16,6 +16,7 @@ #include #include #include +#include namespace at { namespace native { @@ -48,7 +49,7 @@ C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominat a = b; b = tmp; } - + // a is now the GCD numerator /= a; denominator /= a; @@ -200,9 +201,11 @@ template static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { int num_reduce_dims = iter.num_reduce_dims(); int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; std::array strides = { - iter.strides(0).data() + num_reduce_dims, - iter.strides(1).data() + num_reduce_dims, + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, }; auto shape = iter.shape().data() + num_reduce_dims; return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); @@ -211,8 +214,9 @@ static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& template static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; std::array strides = { - iter.strides(1).data(), + iter.strides(input_index).data(), }; return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); } @@ -277,7 +281,7 @@ struct ReduceOp { InputCalculator input_calc; OutputCalculator output_calc; const void* src; - void* dst; + const char* dst[2]; //it accepts at most two destinations // acc_buf used for accumulation among sub Tensor Iterator when accumulation on // output is not permissible void* acc_buf; @@ -286,19 +290,24 @@ struct ReduceOp { int* semaphores; bool accumulate; bool final_output; + int noutputs; ReduceOp(ops_t ops, ReduceConfig config, InputCalculator input_calc, OutputCalculator output_calc, - const void* src, void* dst, void* acc_buf, void* cta_buf, int* semaphores, arg_t ident) + const void* src, char* dst0, optional dst1, void* acc_buf, void* cta_buf, int* semaphores, arg_t ident, int noutputs) : ops(ops) , config(config) , input_calc(input_calc) , output_calc(output_calc) , src(src) - , dst(dst) , acc_buf(acc_buf) , cta_buf(cta_buf) , semaphores(semaphores) - , ident(ident) { + , ident(ident) + , noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } } C10_DEVICE void run() const { @@ -320,7 +329,7 @@ struct ReduceOp { value = block_x_reduce(value, shared_memory); } - auto out = (out_scalar_t*)((char*)dst + base_offsets[0]); + auto out = (out_scalar_t*)((char*)dst[0] + base_offsets[0]); 
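The ReduceOp change above swaps the single void* destination for a two-entry destination array plus a noutputs count, so one reduction pass can write up to two result tensors. A minimal host-side sketch of that dispatch idea, kept separate from the CUDA code; MinArgmin and this set_results pair are illustrative names only, not the PR's types (the kernel uses thrust::tuple and out_scalar_t):

#include <cassert>
#include <cstdio>
#include <utility>

struct MinArgmin {                       // example reduction with two outputs: min + argmin
  using acc_t = std::pair<float, long>;  // (running min, its index)
  static acc_t combine(acc_t a, acc_t b) { return b.first < a.first ? b : a; }
  static std::pair<float, long> project(acc_t a) { return a; }  // pair -> two outputs
};

// Mirrors the two set_results overloads: scalar result vs. tuple/pair result.
void set_results(float x, float* dst0, long* /*dst1*/, int noutputs) {
  assert(noutputs == 1);
  *dst0 = x;
}
void set_results(std::pair<float, long> x, float* dst0, long* dst1, int noutputs) {
  if (noutputs >= 1) *dst0 = x.first;
  if (noutputs >= 2) *dst1 = x.second;
}

int main() {
  float data[5] = {3.f, 1.f, 4.f, 1.5f, 9.f};
  MinArgmin::acc_t acc{data[0], 0};
  for (long i = 1; i < 5; ++i) acc = MinArgmin::combine(acc, {data[i], i});
  float out_val; long out_idx;
  set_results(MinArgmin::project(acc), &out_val, &out_idx, /*noutputs=*/2);
  std::printf("min=%f argmin=%ld\n", out_val, out_idx);
}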
arg_t* acc = nullptr; if (acc_buf != nullptr) { size_t numerator = sizeof(arg_t); @@ -330,19 +339,23 @@ struct ReduceOp { } if (config.should_global_reduce()) { - value = global_reduce(value, out, acc, shared_memory); + value = global_reduce(value, acc, shared_memory); } else if (config.should_store(output_idx)) { if (acc == nullptr) { if (accumulate) { value = accumulate_in_output(out, value); } - *out = project_if_necessary(value); + if (final_output) { + set_results_to_output(value, base_offsets[0]); + } else { + *out = get_accumulated_output(out, value); + } } else { if (accumulate) { value = ops.combine(*acc, value); } if (final_output) { - *out = ops.project(value); + set_results_to_output(value, base_offsets[0]); } else { *acc = value; } @@ -453,14 +466,14 @@ struct ReduceOp { } template - C10_DEVICE out_scalar_t project_if_necessary( - arg_t value, + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, typename std::enable_if::type* = nullptr ) const { - return final_output ? (out_scalar_t)ops.project(value) : (out_scalar_t)value; + assert(!final_output); + return (out_scalar_t)value; } - // This function should never be called -- // it's the version of `accumulate_in_output` // when accumulation in the output is not possible. @@ -473,17 +486,48 @@ struct ReduceOp { return arg_t {}; } + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. template - C10_DEVICE out_scalar_t project_if_necessary( - arg_t value, + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, typename std::enable_if::type* = nullptr ) const { + assert(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::tuple x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (out_scalar_t*)((char*)dst[0] + base_offset); + *res0 = thrust::get<0>(x); + } + if (noutputs >= 2) { + auto res1 = (out_scalar_t *) ((char *) dst[1] + base_offset); + *res1 = thrust::get<1>(x); + } + } + + C10_DEVICE void set_results_to_output(arg_t value, index_t base_offset) const { assert(final_output); - return ops.project(value); + set_results(ops.project(value), base_offset); } - C10_DEVICE arg_t global_reduce(arg_t value, out_scalar_t* out, arg_t* acc, char* shared_memory) const { + C10_DEVICE arg_t global_reduce(arg_t value, arg_t* acc, char* shared_memory) const { arg_t* reduce_buffer = (arg_t*)cta_buf; + index_t output_idx = config.output_idx(); + auto base_offsets = output_calc.get(output_idx); + auto out = (out_scalar_t*)((char*)dst[0] + base_offsets[0]); bool should_store = config.should_store(config.output_idx()); if (should_store) { @@ -523,13 +567,17 @@ struct ReduceOp { if (accumulate) { value = accumulate_in_output(out, value); } - *out = project_if_necessary(value); + if (final_output) { + set_results_to_output(value, base_offsets[0]); + } else { + *out = get_accumulated_output(out, value); + } } else { if (accumulate) { value = ops.combine(*acc, value); } if (final_output) { - *out = ops.project(value); + set_results_to_output(value, base_offsets[0]); } else { *acc = value; } @@ -590,7 +638,7 @@ struct AccumulationBuffer { template inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& 
ops, ident_t ident=0, AccumulationBuffer* acc_buf_ptr=nullptr) { - AT_ASSERT(iter.numel() > 0 && iter.ntensors() == 2); + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); using traits = binary_function_traits; using arg_t = typename traits::arg1_t; @@ -604,7 +652,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id // reused by all recursive function calls. if (acc_buf_ptr == NULL) { // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter - // when accumulation in output is not possible. + // when accumulation in output is not possible. if (!can_accumulate_in_output && !can_use_32bit_indexing) { int64_t output_memory_size = 1; for (int dim = 0; dim < iter.ndim(); dim++) { @@ -627,21 +675,29 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id return; } + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); char* out_data = (char*)iter.data_ptr(0); - const char* in_data = (char*)iter.data_ptr(1); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } char* acc_data = acc_buf_ptr->get_acc_slice(out_data); // Start by assuming that each thread handles a single output and all // the inputs for that output. int64_t num_outputs = iter.num_output_elements(); int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); int64_t dim0; int64_t dim1; // adjust block size to fit width to fast changing dimension - if (iter.strides(/*arg=*/1)[0] == sizeof(scalar_t)) { + if (iter.strides(/*arg=*/input_index)[0] == sizeof(scalar_t)) { dim0 = iter.shape()[0]; dim1 = num_outputs; } else { @@ -654,7 +710,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id int block_width = config.block_width; int block_height = config.block_height; - if (iter.ndim() == 0 || iter.strides(/*arg=*/1)[0] == sizeof(scalar_t)) { + if (iter.ndim() == 0 || iter.strides(/*arg=*/input_index)[0] == sizeof(scalar_t)) { // Split the input across lanes if the input is contiguous in the reduced // dimension. This will require reduction between threads using warp // shuffle instructions and shared memory (if block_width > warpSize). @@ -706,10 +762,12 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id output_calc, in_data, out_data, + out_data_extra, acc_data, buffer.get(), (int*)semaphores.get(), - ident); + ident, + noutputs); reduce.accumulate = iter.should_accumulate(); reduce.final_output = iter.is_final_output(); diff --git a/aten/src/ATen/native/cuda/ReduceOpsKernel.cu b/aten/src/ATen/native/cuda/ReduceOpsKernel.cu index 5c2c22981622..58dfb7cc738a 100644 --- a/aten/src/ATen/native/cuda/ReduceOpsKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceOpsKernel.cu @@ -12,6 +12,7 @@ #include #include #include +#include namespace at { namespace native { @@ -27,14 +28,14 @@ template void std_var_kernel_impl(TensorIterator& iter, bool unbiased, bool take_sqrt) { // reducing unrolling factor to 2 for welford kernel // This is necessary to lower register usage that leads to register spills. 
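WelfordOps/WelfordData in the std_var kernel below are named after Welford's single-pass mean/variance recurrence; the host-side sketch here shows that standard update, assuming the usual formulation (welford_step and finalize are illustrative names, with unbiased/take_sqrt mirroring the kernel's flags):

#include <cmath>
#include <cstdio>

struct WelfordAcc { double mean = 0, m2 = 0; long n = 0; };

void welford_step(WelfordAcc& a, double x) {
  a.n += 1;
  double delta = x - a.mean;
  a.mean += delta / a.n;
  a.m2 += delta * (x - a.mean);   // uses the *updated* mean
}

// unbiased: divide by (n - 1); take_sqrt: return std instead of var
double finalize(const WelfordAcc& a, bool unbiased, bool take_sqrt) {
  double var = a.m2 / (unbiased ? a.n - 1 : a.n);
  return take_sqrt ? std::sqrt(var) : var;
}

int main() {
  WelfordAcc acc;
  double xs[8] = {2, 4, 4, 4, 5, 5, 7, 9};
  for (double x : xs) welford_step(acc, x);
  std::printf("mean=%g var=%g std=%g\n",
              acc.mean, finalize(acc, false, false), finalize(acc, false, true));
}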
- gpu_reduce_kernel(iter, WelfordOps { unbiased, take_sqrt }, WelfordData {}); + gpu_reduce_kernel(iter, WelfordOps> { unbiased, take_sqrt }, WelfordData {}); } template <> void std_var_kernel_impl(TensorIterator& iter, bool unbiased, bool take_sqrt) { // reducing unrolling factor to 2 for welford kernel // This is necessary to lower register usage that leads to register spills. - gpu_reduce_kernel(iter, WelfordOps { unbiased, take_sqrt }, WelfordData {}); + gpu_reduce_kernel(iter, WelfordOps> { unbiased, take_sqrt }, WelfordData {}); } template diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index 0a1cacf416bc..67d9795f5651 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -148,14 +148,14 @@ __global__ void reflection_pad2d_backward_out_kernel( void reflection_pad1d_out_template( Tensor &output, const Tensor &input_, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input_), + TORCH_CHECK(canUse32BitIndexMath(input_), "input tensor must fit into 32-bit index math"); int64_t dim_plane = 0; int64_t dim_w = 1; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 2 || input_.ndimension() == 3), "non-empty 2D " "or 3D (batch mode) tensor expected for input, but got: ", input_); @@ -172,11 +172,11 @@ void reflection_pad1d_out_template( int64_t input_w = input_.size(dim_w); int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(pad_l < input_w && pad_r < input_w, "Padding size should be less " + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Padding size should be less " "than the corresponding input dimension, but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_); - AT_CHECK(output_w >= 1, + TORCH_CHECK(output_w >= 1, "input (W: ", input_w, ")is too small. Calculated output W: ", output_w); if (input_.ndimension() == 2) { @@ -206,10 +206,10 @@ void reflection_pad1d_backward_out_template( Tensor & grad_input, const Tensor & grad_output_, const Tensor & input, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input), + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(canUse32BitIndexMath(grad_output_), + TORCH_CHECK(canUse32BitIndexMath(grad_output_), "input tensor must fit into 32-bit index math"); int64_t dim_plane = 0; @@ -231,7 +231,7 @@ void reflection_pad1d_backward_out_template( Tensor grad_output = grad_output_.contiguous(); - AT_CHECK(output_w == grad_output.size(dim_w), + TORCH_CHECK(output_w == grad_output.size(dim_w), "gradOutput width unexpected. 
Expected: ", output_w, ", Got: ", grad_output.size(dim_w)); @@ -252,7 +252,7 @@ void reflection_pad1d_backward_out_template( void reflection_pad2d_out_template( Tensor &output, const Tensor &input_, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input_), + TORCH_CHECK(canUse32BitIndexMath(input_), "input tensor must fit into 32-bit index math"); int plane_dim = 0; @@ -260,7 +260,7 @@ void reflection_pad2d_out_template( int dim_w = 2; int nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 3 || input_.ndimension() == 4), "non-empty 3D or " "4D (batch mode) tensor expected for input, but got: ", input_); @@ -280,12 +280,12 @@ void reflection_pad2d_out_template( int input_h = input_.size(dim_h); int input_w = input_.size(dim_w); - AT_CHECK(pad_l < input_w && pad_r < input_w, + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Padding size should be less than the corresponding input dimension, but " "got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.sizes()); - AT_CHECK(pad_t < input_h && pad_b < input_h, + TORCH_CHECK(pad_t < input_h && pad_b < input_h, "Padding size should be less than the corresponding input dimension, but " "got: padding (", pad_t, ", ", pad_b, ") at dimension ", dim_h, " of input ", input_.sizes()); @@ -293,7 +293,7 @@ void reflection_pad2d_out_template( int output_h = input_h + pad_t + pad_b; int output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w >= 1 || output_h >= 1, + TORCH_CHECK(output_w >= 1 || output_h >= 1, "input (H: ", input_h, ", W: ", input_w, ")is too small. Calculated " "output H: ", output_h, " W: ", output_w); @@ -326,9 +326,9 @@ void reflection_pad2d_out_template( void reflection_pad2d_backward_out_template( Tensor &grad_input, const Tensor &grad_output_, const Tensor &input, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input), + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(canUse32BitIndexMath(grad_output_), + TORCH_CHECK(canUse32BitIndexMath(grad_output_), "output gradient tensor must fit into 32-bit index math"); int plane_dim = 0; @@ -355,9 +355,9 @@ void reflection_pad2d_backward_out_template( int output_h = input_h + pad_t + pad_b; int output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w == grad_output_.size(dim_w), "grad_output width " + TORCH_CHECK(output_w == grad_output_.size(dim_w), "grad_output width " "unexpected. Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); - AT_CHECK(output_h == grad_output_.size(dim_h), "grad_output height " + TORCH_CHECK(output_h == grad_output_.size(dim_h), "grad_output height " "unexpected. 
Expected: ", output_h, ", Got: ", grad_output_.size(dim_h)); Tensor grad_output = grad_output_.contiguous(); diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index 867ebf275bfa..c9da8f440b72 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -205,9 +205,9 @@ void replication_pad1d_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -216,7 +216,7 @@ void replication_pad1d_out_cuda_template( int numBatch = 1; int numInputDims = input.ndimension(); - AT_CHECK(input.numel() > 0 && (numInputDims == 2 || numInputDims == 3), + TORCH_CHECK(input.numel() > 0 && (numInputDims == 2 || numInputDims == 3), "2D or 3D (batch mode) tensor expected for input") if (numInputDims == 3) { @@ -229,7 +229,7 @@ void replication_pad1d_out_cuda_template( int inputW = input.size(dimw); int outputW = inputW + padL + padR; - AT_CHECK(outputW >= 1, + TORCH_CHECK(outputW >= 1, "input (W: ", inputW, ")is too small." " Calculated output W: ", outputW); @@ -279,11 +279,11 @@ void replication_pad1d_backward_out_cuda_template( IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), "output gradient tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -298,7 +298,7 @@ void replication_pad1d_backward_out_cuda_template( int iwidth = input.size(dimw); int owidth = iwidth + padL + padR; - AT_CHECK(owidth == gradOutput.size(dimw), + TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); @@ -336,9 +336,9 @@ void replication_pad2d_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -350,7 +350,7 @@ void replication_pad2d_out_cuda_template( int numBatch = 1; int numInputDims = input.dim(); - AT_CHECK(input.numel() && (numInputDims == 3 || numInputDims == 4), + TORCH_CHECK(input.numel() && (numInputDims == 3 || numInputDims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input, but got: ", input) @@ -367,7 +367,7 @@ void replication_pad2d_out_cuda_template( int outputH = inputH + padT + padB; int outputW = inputW + padL + padR; - AT_CHECK(outputW >= 1 || outputH >= 1, + TORCH_CHECK(outputW >= 1 || outputH >= 1, "input (H: ", inputH, ", W: ", inputW, ") is too small." 
" Calculated output H: ", outputH, " W: ", outputW); @@ -418,11 +418,11 @@ void replication_pad2d_backward_out_cuda_template( IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), "output gradient tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -443,10 +443,10 @@ void replication_pad2d_backward_out_cuda_template( int oheight = iheight + padT + padB; int owidth = iwidth + padL + padR; - AT_CHECK(owidth == gradOutput.size(dimw), + TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); - AT_CHECK(oheight == gradOutput.size(dimh), + TORCH_CHECK(oheight == gradOutput.size(dimh), "gradOutput height unexpected. Expected: ", oheight, ", Got: ", gradOutput.size(dimh)); @@ -483,11 +483,11 @@ static inline void shapeCheck3d( int pleft, int pright, int ptop, int pbottom, int pfront, int pback) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - AT_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), + TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); int planeDim = 0; @@ -508,7 +508,7 @@ static inline void shapeCheck3d( int odepth = idepth + pfront + pback; int oheight = iheight + ptop + pbottom; int owidth = iwidth + pleft + pright; - AT_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, ") is too small." " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); @@ -521,11 +521,11 @@ static inline void shapeAndGradOutputCheck3d( int pleft, int pright, int ptop, int pbottom, int pfront, int pback) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - AT_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), + TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); int planeDim = 0; @@ -546,24 +546,24 @@ static inline void shapeAndGradOutputCheck3d( int odepth = idepth + pfront + pback; int oheight = iheight + ptop + pbottom; int owidth = iwidth + pleft + pright; - AT_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, ") is too small." " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), "output gradient tensor must fit into 32-bit index math"); - AT_CHECK(numPlanes == gradOutput.size(planeDim), + TORCH_CHECK(numPlanes == gradOutput.size(planeDim), "gradOutput width unexpected. 
Expected: ", numPlanes, ", Got: ", gradOutput.size(planeDim)); - AT_CHECK(owidth == gradOutput.size(dimw), + TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); - AT_CHECK(oheight == gradOutput.size(dimh), + TORCH_CHECK(oheight == gradOutput.size(dimh), "gradOutput height unexpected. Expected: ", oheight, ", Got: ", gradOutput.size(dimh)); - AT_CHECK(odepth == gradOutput.size(dimd), + TORCH_CHECK(odepth == gradOutput.size(dimd), "gradOutput depth unexpected. Expected: ", odepth, ", Got: ", gradOutput.size(dimd)); } @@ -573,7 +573,7 @@ void replication_pad3d_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; @@ -654,7 +654,7 @@ void replication_pad3d_backward_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index ef3031e2208b..3b2db0fa0767 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -330,6 +330,7 @@ blockReduce(AccumT* smem, AccumT val, AccumT warpVal = defaultVal; // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1; if (threadIdx.x < 32) { int lane = threadIdx.x % 32; if (lane < blockDim.x / 32) { @@ -337,6 +338,9 @@ blockReduce(AccumT* smem, AccumT val, for (int i = 0; i < 32; ++i) { warpVal = r(warpVal, smem[lane * 32 + i]); } +#if CUDA_VERSION >= 9000 + __syncwarp(mask); +#endif smem[lane] = warpVal; } } @@ -482,7 +486,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t static_assert(std::is_same, float>::value, "accscalar_t for half should be float"); if (input.dim() == 0) input = input.view(1); int64_t dim = maybe_wrap_dim(dim_, input.dim()); - AT_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); int64_t outer_size = 1; int64_t dim_size = input.size(dim); @@ -557,7 +561,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t auto grad = grad_.contiguous(); static_assert(std::is_same, float>::value, "accscalar_t for half should be float"); if (grad.dim() == 0) grad = grad.view(1); - AT_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); auto output = output_.contiguous(); if (output.dim() == 0) output = output.view(1); int64_t outer_size = 1; diff --git a/aten/src/ATen/native/cuda/SortingKthValue.cu b/aten/src/ATen/native/cuda/SortingKthValue.cu index ebf1dc8d7e6a..2c2c63cc06cc 100644 --- a/aten/src/ATen/native/cuda/SortingKthValue.cu +++ b/aten/src/ATen/native/cuda/SortingKthValue.cu @@ -145,11 +145,11 @@ void kthvalue_cuda_template( // FIXME: This seems bogus, I only do this because it was the old behaviour. 
// The reductions are fine, as long as the axis being reduced along // isn't of 0 elements (and the output has elements). - AT_CHECK( + TORCH_CHECK( self.numel() > 0, "cannot perform reduction function kthvalue", " on tensor with no elements because the operation does not have an identity"); - AT_CHECK(k >= 1 && k <= slicesize, "selected number k out of range"); + TORCH_CHECK(k >= 1 && k <= slicesize, "selected number k out of range"); _reduction_with_indices_allocate_or_resize_output( values, indices, self, dim, keepdim); @@ -159,7 +159,7 @@ void kthvalue_cuda_template( return; } - AT_CHECK( + TORCH_CHECK( self.dim() <= MAX_TENSORINFO_DIMS, "cannot operate on more than ", MAX_TENSORINFO_DIMS, @@ -188,14 +188,14 @@ void kthvalue_cuda_template( // this does not reduce to median with dim beause we don't want to copy twice template Tensor median_cuda_template(const Tensor& self) { - AT_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); + TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); if (self.dim() == 0 && self.numel() == 1) { return self.clone(); } auto self_copy = self.clone().view(-1); auto values = at::empty({1}, self.options()); auto indices = at::empty({1}, self.options().dtype(kLong)); - AT_CHECK( + TORCH_CHECK( self.dim() <= MAX_TENSORINFO_DIMS, "cannot operate on more than ", MAX_TENSORINFO_DIMS, diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index 9efd035178fa..ea340cdd9b61 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -121,7 +121,7 @@ struct TopKTypeConfig { typedef uint32_t RadixType; static inline __device__ RadixType convert(at::Half v) { -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) RadixType x = __half_as_ushort(v); RadixType mask = -((x >> 15)) | 0x8000; return (v == v) ? 
(x ^ mask) : 0xffff; @@ -132,7 +132,7 @@ struct TopKTypeConfig { } static inline __device__ at::Half deconvert(RadixType v) { -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) RadixType mask = ((v >> 15) - 1) | 0x8000; return __ushort_as_half(v ^ mask); #else diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 97a712a4c184..fd8eb29c8377 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -286,7 +286,7 @@ CuFFTParamsLRUCache &cufft_get_plan_cache(int64_t device_index) { namespace detail { int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_get_plan_cache_max_size: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); @@ -294,7 +294,7 @@ int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) { } void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_set_plan_cache_max_size: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); @@ -302,7 +302,7 @@ void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) } int64_t cufft_get_plan_cache_size_impl(int64_t device_index) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_get_plan_cache_size: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); @@ -310,7 +310,7 @@ int64_t cufft_get_plan_cache_size_impl(int64_t device_index) { } void cufft_clear_plan_cache_impl(int64_t device_index) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_clear_plan_cache: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 7428f988c4b4..3143fec586d7 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -311,7 +311,7 @@ Tensor _histc_cuda_template( if (nbins <= 0) { AT_ERROR("bins must be > 0"); } - Tensor output = native::zeros({nbins}, device(DeviceType::CUDA).dtype(kLong)); + Tensor output = native::zeros({nbins}, device(DeviceType::CUDA).dtype(self.scalar_type())); input_t minvalue = min; input_t maxvalue = max; if (min == max) { @@ -322,7 +322,8 @@ Tensor _histc_cuda_template( minvalue = minvalue - 1; maxvalue = maxvalue + 1; } - auto ret = cuda::CUDA_tensor_histogram( + + auto ret = cuda::CUDA_tensor_histogram( output, self, Tensor(), nbins, minvalue, maxvalue); return output; } diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 90d0208569f6..cc4e5678457d 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ 
b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -26,7 +26,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n) { } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { - AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); if(m < 0) { m = n; @@ -46,7 +46,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { Tensor empty_cuda(IntArrayRef size, const TensorOptions& options) { AT_ASSERT(options.backend() == at::Backend::CUDA); AT_ASSERT(!options.is_variable()); // is_variable should have been 'unpacked' // TODO: remove this when Variable and Tensor are merged - AT_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); + TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); check_size_nonnegative(size); auto* allocator = at::cuda::getCUDADeviceAllocator(); @@ -74,8 +74,8 @@ Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, const TensorOpti } Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { - AT_CHECK(n >= 0, "n must be non-negative, got", n); - AT_CHECK(at::scalar_tensor(n, result.options()).defined(), + TORCH_CHECK(n >= 0, "n must be non-negative, got", n); + TORCH_CHECK(at::scalar_tensor(n, result.options()).defined(), "n is too large for result tensor type: '", result.type().toString(), "'"); result.resize_({n}); @@ -322,7 +322,7 @@ Tensor tril_indices_cuda( dim3 dim_grid; // using tril_size instead of tensor.numel(), as each thread takes care of // two elements in the tensor. - AT_CHECK( + TORCH_CHECK( cuda::getApplyGrid(tril_size, dim_grid, tensor.get_device()), "unable to get dim grid"); @@ -398,7 +398,7 @@ Tensor triu_indices_cuda( // using triu_size instead of tensor.numel(), as each thread takes care of // two elements in the tensor. 
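The at::Half convert/deconvert pair in SortingRadixSelect.cuh above uses the usual order-preserving float-to-unsigned transform: flip only the sign bit of non-negative values and flip every bit of negative values, so the resulting unsigned integers sort in the same order as the floats. A host-side sketch on raw 16-bit half patterns, with no CUDA intrinsics and the kernel's NaN special case omitted:

#include <cstdint>
#include <cstdio>

uint16_t convert(uint16_t x) {            // x: IEEE half bit pattern
  uint16_t mask = (uint16_t)((-(int)(x >> 15)) | 0x8000);  // all ones if negative, else 0x8000
  return (uint16_t)(x ^ mask);
}
uint16_t deconvert(uint16_t v) {          // exact inverse of convert
  uint16_t mask = (uint16_t)(((v >> 15) - 1) | 0x8000);
  return (uint16_t)(v ^ mask);
}

int main() {
  // Half bit patterns for -2.0, -1.0, 0.0, 1.0, 2.0; after convert() the
  // unsigned values must be strictly increasing, and deconvert() restores them.
  uint16_t vals[5] = {0xC000, 0xBC00, 0x0000, 0x3C00, 0x4000};
  for (uint16_t v : vals)
    std::printf("%#06x -> %#06x (back: %#06x)\n",
                (unsigned)v, (unsigned)convert(v), (unsigned)deconvert(convert(v)));
}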
- AT_CHECK( + TORCH_CHECK( cuda::getApplyGrid(triu_size, dim_grid, tensor.get_device()), "unable to get dim grid"); diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index d07863423e6d..b9a06cb128ee 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -179,7 +179,7 @@ Tensor roll_cuda(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { dim3 dim_block = cuda::getApplyBlock(); dim3 dim_grid; - AT_CHECK(cuda::getApplyGrid(N, dim_grid, in_tensor.get_device()), "unable to get dim grid"); + TORCH_CHECK(cuda::getApplyGrid(N, dim_grid, in_tensor.get_device()), "unable to get dim grid"); auto total_dims = in_tensor.dim(); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu new file mode 100644 index 000000000000..74dabf5a13da --- /dev/null +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +template +void fill_kernel_impl(TensorIterator& iter, Scalar value_scalar) { + auto value = value_scalar.to(); + gpu_nullary_kernel(iter, [value]GPU_LAMBDA() -> scalar_t { + return value; + }); +} + +static void fill_kernel_cuda(TensorIterator& iter, Scalar value) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, iter.dtype(), "fill_cuda", [&]() { + fill_kernel_impl(iter, value); + }); +} + +REGISTER_DISPATCH(fill_stub, &fill_kernel_cuda); + +}} diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index 3395701efc5e..10ad1d7cec1b 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -38,7 +38,7 @@ std::tuple compute_unique( if (!return_inverse) { inverse_indices = at::empty({0}, options); } else { - AT_CHECK(sorted_indices.defined(), + TORCH_CHECK(sorted_indices.defined(), "return_inverse is set to true, but sorted_indices is undefined. Send a bug report!"); const int64_t *sorted_indices_ptr = sorted_indices.data(); Tensor inv_loc = at::empty({num_inp}, options); diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh new file mode 100644 index 000000000000..ff9d9594ea6a --- /dev/null +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -0,0 +1,247 @@ +#include +#include +#include + +#include + +namespace at { +namespace native { + +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? 
a : b; +} + +static inline void upsample_1d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int nbatch, + int nchannels, + int input_width, + int output_width) { + TORCH_CHECK( + input_width > 0 && output_width > 0, + "input and output sizes should be greater than 0, but got input (W: ", + input_width, + ") and output (W: ", + output_width, + ")"); + + if (input.defined()) { + TORCH_CHECK( + input.numel() != 0 && input.dim() == 3, + "non-empty 3D input tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 3, 0, nbatch); + check_dim_size(grad_output, 3, 1, nchannels); + check_dim_size(grad_output, 3, 2, output_width); + } +} + +static inline void upsample_2d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int nbatch, + int nchannels, + int input_height, + int input_width, + int output_height, + int output_width) { + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + TORCH_CHECK( + input.numel() != 0 && input.dim() == 4, + "non-empty 4D input tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 4, 0, nbatch); + check_dim_size(grad_output, 4, 1, nchannels); + check_dim_size(grad_output, 4, 2, output_height); + check_dim_size(grad_output, 4, 3, output_width); + } +} + +static inline void upsample_3d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int nbatch, + int nchannels, + int input_depth, + int input_height, + int input_width, + int output_depth, + int output_height, + int output_width) { + TORCH_CHECK( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (D: ", + input_depth, + ", H: ", + input_height, + ", W: ", + input_width, + ") output (D: ", + output_depth, + ", H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + TORCH_CHECK( + input.numel() != 0 && input.dim() == 5, + "Non-empty 5D data tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 5, 0, nbatch); + check_dim_size(grad_output, 5, 1, nchannels); + check_dim_size(grad_output, 5, 2, output_depth); + check_dim_size(grad_output, 5, 3, output_height); + check_dim_size(grad_output, 5, 4, output_width); + } +} + +template +__host__ __forceinline__ static accscalar_t area_pixel_compute_scale( + int input_size, + int output_size, + bool align_corners) { + if (output_size > 1) { + return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1) + : (accscalar_t)input_size / output_size; + } else { + return static_cast(0); + } +} + +template +__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index( + accscalar_t scale, + int dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + accscalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // See Note[Follow Opencv resize logic] + return (!cubic && src_idx < static_cast(0)) + ? 
static_cast(0) + : src_idx; + } +} + +__device__ __forceinline__ static int nearest_neighbor_compute_source_index( + const float scale, + int dst_index, + int input_size) { + const int src_index = + min(static_cast(floorf(dst_index * scale)), input_size - 1); + return src_index; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ static scalar_t upsample_get_value_bounded( + const PackedTensorAccessor& data, + int batch, + int channel, + int height, + int width, + int y, + int x) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch][channel][access_y][access_x]; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ static void upsample_increment_value_bounded( + PackedTensorAccessor& data, + int batch, + int channel, + int height, + int width, + int y, + int x, + accscalar_t value) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + /* TODO: result here is trucated to scalar_t, + check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912 + */ + atomicAdd( + &data[batch][channel][access_y][access_x], static_cast(value)); +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +__device__ __forceinline__ static accscalar_t cubic_convolution1( + accscalar_t x, + accscalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +__device__ __forceinline__ static accscalar_t cubic_convolution2( + accscalar_t x, + accscalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +__device__ __forceinline__ static void get_cubic_upsampling_coefficients( + accscalar_t coeffs[4], + accscalar_t t) { + accscalar_t A = -0.75; + + accscalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + accscalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +__device__ __forceinline__ static accscalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + accscalar_t t) { + accscalar_t coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 5375e33513ca..443e88ec078b 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -1,25 +1,329 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bicubic2d_out_frame( + const int num_elements, + const accscalar_t height_scale, + const accscalar_t width_scale, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int input_height = idata.size(2); + const int input_width = idata.size(3); + const int output_height = odata.size(2); + const int output_width = odata.size(3); + + if (index >= num_elements) { + return; + } + + // Special case: input and output are the same size, just copy + const int output_x = index % output_width; + 
const int output_y = index / output_width; + + if (input_height == output_height && input_width == output_width) { + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; c++) { + const scalar_t val = idata[n][c][output_y][output_x]; + odata[n][c][output_y][output_x] = val; + } + } + return; + } + + // Interpolation kernel + accscalar_t real_x = area_pixel_compute_source_index( + width_scale, output_x, align_corners, /*cubic=*/true); + int in_x = floorf(real_x); + accscalar_t t_x = real_x - in_x; + + accscalar_t real_y = area_pixel_compute_source_index( + height_scale, output_y, align_corners, /*cubic=*/true); + int in_y = floorf(real_y); + accscalar_t t_y = real_y - in_y; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; c++) { + accscalar_t coefficients[4]; + + for (int k = 0; k < 4; k++) { + coefficients[k] = cubic_interp1d( + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x - 1), + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x + 0), + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x + 1), + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x + 2), + t_x); + } + + odata[n][c][output_y][output_x] = static_cast(cubic_interp1d( + coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + t_y)); + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bicubic2d_backward_out_frame( + const int num_elements, + const accscalar_t height_scale, + const accscalar_t width_scale, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int input_height = idata.size(2); + const int input_width = idata.size(3); + const int output_height = odata.size(2); + const int output_width = odata.size(3); + + if (index >= num_elements) { + return; + } + + const int output_x = index % output_width; + const int output_y = index / output_width; + // special case: output_xust copy + if (input_height == output_height && input_width == output_width) { + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][output_y][output_x]; + idata[n][c][output_y][output_x] = val; + } + } + return; + } + + accscalar_t real_x = area_pixel_compute_source_index( + width_scale, output_x, align_corners, /*cubic=*/true); + int input_x = floorf(real_x); + accscalar_t t_x = real_x - input_x; + + accscalar_t real_y = area_pixel_compute_source_index( + height_scale, output_y, align_corners, /*cubic=*/true); + int input_y = floorf(real_y); + accscalar_t t_y = real_y - input_y; + + accscalar_t x_coeffs[4]; + accscalar_t y_coeffs[4]; + + get_cubic_upsampling_coefficients(x_coeffs, t_x); + get_cubic_upsampling_coefficients(y_coeffs, t_y); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + scalar_t out_value = odata[n][c][output_y][output_x]; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + upsample_increment_value_bounded( + idata, + n, + c, + input_height, + input_width, + input_y - 1 + i, + input_x - 1 + j, + out_value * y_coeffs[i] * x_coeffs[j]); + } + } + } + } +} + +static void upsample_bicubic2d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef 
output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_bicubic2d_out", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + upsample_2d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + output.resize_({input.size(0), input.size(1), output_height, output_width}); + output.zero_(); + + AT_ASSERT( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0); + + const int num_output_elements = output_height * output_width; + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + + // Launch kernel + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_bicubic2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + // Get scaling factors + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_bicubic2d_out_frame + <<>>( + num_output_elements, + rheight, + rwidth, + align_corners, + idata, + odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_bicubic2d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_bicubic2d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + upsample_2d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_({nbatch, channels, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_bicubic2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + 
upsample_bicubic2d_backward_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_bicubic2d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_forward_out( - output, input, output_size, align_corners); + upsample_bicubic2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_bicubic2d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_bicubic2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_bicubic2d_backward_out_cuda( @@ -28,8 +332,9 @@ Tensor& upsample_bicubic2d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_bicubic2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_bicubic2d_backward_cuda( @@ -37,9 +342,11 @@ Tensor upsample_bicubic2d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_backward( - grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_bicubic2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 7c53443f70a6..d4e8d1b14f55 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -1,25 +1,308 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bilinear2d_out_frame( + const int n, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][h1][w1]; + odata[n][c][h2][w2] = val; + } + } + return; + } + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 
1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const accscalar_t val = h0lambda * + (w0lambda * idata[n][c][h1][w1] + + w1lambda * idata[n][c][h1][w1 + w1p]) + + h1lambda * + (w0lambda * idata[n][c][h1 + h1p][w1] + + w1lambda * idata[n][c][h1 + h1p][w1 + w1p]); + odata[n][c][h2][w2] = static_cast(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bilinear2d_backward_out_frame( + const int n, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][h1][w1]; + idata[n][c][h2][w2] = val; + } + } + return; + } + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][h2][w2]; + atomicAdd( + &idata[n][c][h1][w1], + static_cast(h0lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][h1][w1 + w1p], + static_cast(h0lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][h1 + h1p][w1], + static_cast(h1lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][h1 + h1p][w1 + w1p], + static_cast(h1lambda * w1lambda * d2val)); + } + } + } +} + +static void upsample_bilinear2d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_bilinear2d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + upsample_2d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + output.resize_({input.size(0), input.size(1), output_height, output_width}); + output.zero_(); + + AT_ASSERT( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_bilinear2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_bilinear2d_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_bilinear2d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_bilinear2d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + upsample_2d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_({nbatch, channels, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = 
output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_bilinear2d_backward_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_bilinear2d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_forward_out( - output, input, output_size, align_corners); + upsample_bilinear2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_bilinear2d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_bilinear2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_bilinear2d_backward_out_cuda( @@ -28,8 +311,9 @@ Tensor& upsample_bilinear2d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_bilinear2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_bilinear2d_backward_cuda( @@ -37,9 +321,11 @@ Tensor upsample_bilinear2d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_backward( - grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_bilinear2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index eb491f5e9c2b..0f70b57344cb 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -1,25 +1,248 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +#ifdef __HIP_PLATFORM_HCC__ +C10_LAUNCH_BOUNDS_1(1024) +#endif +__global__ void upsample_linear1d_out_frame( + const int n, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + if (index < n) { + const int w2 = index % width2; + // 
special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][w1]; + odata[n][c][w2] = val; + } + } + return; + } + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const accscalar_t val = + w0lambda * idata[n][c][w1] + w1lambda * idata[n][c][w1 + w1p]; + odata[n][c][w2] = static_cast(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +#ifdef __HIP_PLATFORM_HCC__ +C10_LAUNCH_BOUNDS_1(1024) +#endif +__global__ void upsample_linear1d_out_frame_backward( + const int n, + const accscalar_t rwidth, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][w1]; + idata[n][c][w2] = val; + } + } + return; + } + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][w2]; + atomicAdd(&idata[n][c][w1], static_cast(w0lambda * d2val)); + atomicAdd( + &idata[n][c][w1 + w1p], static_cast(w1lambda * d2val)); + } + } + } +} + +static void upsample_linear1d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_linear1d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + int output_width = output_size[0]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_width = input.size(2); + + upsample_1d_shape_check( + input, Tensor(), nbatch, channels, input_width, output_width); + + output.resize_({input.size(0), input.size(1), output_width}); + output.zero_(); + + AT_ASSERT(input_width > 0 && output_width > 0); + + const int num_kernels = output_width; + const int num_threads = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_linear1d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_linear1d_out_frame + <<>>(num_kernels, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + 
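// Illustrative sketch (not part of the patch): a single-threaded CPU version
// of the scatter that upsample_linear1d_out_frame_backward above performs and
// that the backward template below launches. Each output element pushes its
// gradient into the two neighbouring input elements with the same w0/w1
// weights used in the forward pass; on the GPU the writes go through
// atomicAdd because different output elements can land on the same input
// element, while a single CPU thread can simply use +=. The inline source
// index formula is an assumption about area_pixel_compute_source_index;
// grad_in must be zero-initialised and rwidth is the value produced by
// area_pixel_compute_scale.
static void linear1d_backward_row(float* grad_in, const float* grad_out,
                                  int width1, int width2, float rwidth,
                                  bool align_corners) {
  for (int w2 = 0; w2 < width2; ++w2) {
    float w1r = align_corners ? rwidth * w2 : rwidth * (w2 + 0.5f) - 0.5f;
    if (w1r < 0.f) {
      w1r = 0.f;
    }
    const int w1 = static_cast<int>(w1r);
    const int w1p = (w1 < width1 - 1) ? 1 : 0;
    const float w1lambda = w1r - w1;
    grad_in[w1] += (1.f - w1lambda) * grad_out[w2];
    grad_in[w1 + w1p] += w1lambda * grad_out[w2];
  }
}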
+static void upsample_linear1d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_output_arg{grad_output_, "grad_output_", 1}, + grad_input_arg{grad_input, "grad_input", 2}; + checkAllSameGPU( + "upsample_linear1d_backward_out_cuda", {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int output_width = output_size[0]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_width = input_size[2]; + + upsample_1d_shape_check( + Tensor(), grad_output_, nbatch, channels, input_width, output_width); + + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_({nbatch, channels, input_width}); + grad_input.zero_(); + + const int num_kernels = output_width; + const int num_threads = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_linear1d_out_frame_backward", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_linear1d_out_frame_backward + <<>>(num_kernels, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_linear1d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_forward_out( - output, input, output_size, align_corners); + upsample_linear1d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_linear1d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_linear1d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_linear1d_backward_out_cuda( @@ -28,8 +251,9 @@ Tensor& upsample_linear1d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_linear1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_linear1d_backward_cuda( @@ -37,9 +261,11 @@ Tensor upsample_linear1d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_backward( - grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_linear1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index bfba5a16ddfc..2218d2775a53 100644 --- 
a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -1,23 +1,215 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { -Tensor& upsample_nearest1d_out_cuda( +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest1d_out_frame( + const int n, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + const float scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][w1]; + odata[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][w1]; + odata[n][c][w2] = val; + } + } + } +} + +// Backward operation +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest1d_backward_out_frame( + const int n, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + const float scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][w1]; + idata[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][w2]; + atomicAdd(&idata[n][c][w1], d2val); + } + } + } +} + +static void upsample_nearest1d_out_cuda_template( Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest1d_forward_out( - output, input, output_size); + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_nearest1d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + int output_width = output_size[0]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_width = input.size(2); + + upsample_1d_shape_check( + input, Tensor(), nbatch, channels, input_width, output_width); + + AT_ASSERT(input_width > 0 && output_width > 0); + + output.resize_({input.size(0), input.size(1), output_width}); + output.zero_(); + + const int num_kernels = output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_nearest1d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + 
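// Illustrative note (not part of the patch): the launch configuration inside
// the <<< >>> chevrons is not visible in this rendering of the diff. For
// these one-thread-per-output-element kernels it presumably amounts to
// ceiling division of num_kernels by num_threads, roughly:
//
//   const int num_blocks = (num_kernels + num_threads - 1) / num_threads;
//   upsample_nearest1d_out_frame<...>   // template arguments as declared above
//       <<<num_blocks, num_threads, 0, stream>>>(num_kernels, idata, odata);
//
// Worked example: 2500 output elements with 1024 threads per block give
// ceil(2500 / 1024) = 3 blocks, and the `if (index < n)` guard in the kernel
// discards the surplus threads of the last block.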
upsample_nearest1d_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_nearest1d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_nearest1d_backward_out_cuda_template", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int output_width = output_size[0]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_width = input_size[2]; + + upsample_1d_shape_check( + Tensor(), grad_output_, nbatch, channels, input_width, output_width); + + Tensor grad_output = grad_output_.contiguous(); + grad_input.resize_({nbatch, channels, input_width}); + grad_input.zero_(); + + const int num_kernels = output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_nearest1d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + upsample_nearest1d_backward_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); } -Tensor upsample_nearest1d_cuda( +} // namespace + +Tensor& upsample_nearest1d_out_cuda( + Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest1d_forward( - input, output_size); + upsample_nearest1d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor upsample_nearest1d_cuda(const Tensor& input, IntArrayRef output_size) { + Tensor output = at::empty_like(input); + upsample_nearest1d_out_cuda_template(output, input, output_size); + return output; } Tensor& upsample_nearest1d_backward_out_cuda( @@ -25,17 +217,20 @@ Tensor& upsample_nearest1d_backward_out_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest1d_backward_out( - grad_input, grad_output, output_size, input_size); + upsample_nearest1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } Tensor upsample_nearest1d_backward_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest1d_backward( - grad_output, output_size, input_size); + Tensor grad_input = at::empty_like(grad_output); + upsample_nearest1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index 83f40f68d6fe..f8d99609b84c 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -1,23 +1,255 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { -Tensor& 
upsample_nearest2d_out_cuda( +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest2d_out_frame( + const int n, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][h1][w1]; + odata[n][c][h2][w2] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][h1][w1]; + odata[n][c][h2][w2] = val; + } + } + } +} + +// Backward operation +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest2d_backward_out_frame( + const int n, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][h2][w2]; + idata[n][c][h1][w1] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][h2][w2]; + atomicAdd(&idata[n][c][h1][w1], d2val); + } + } + } +} + +static void upsample_nearest2d_out_cuda_template( Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest2d_forward_out( - output, input, output_size); + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU( + "upsample_nearest2d_out_cuda_template", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + upsample_2d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_height, + input_width, + output_height, + 
output_width); + + AT_ASSERT( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0); + + output.resize_({input.size(0), input.size(1), output_height, output_width}); + output.zero_(); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_nearest2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + upsample_nearest2d_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_nearest2d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_nearest2d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + upsample_2d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + grad_input.resize_({nbatch, channels, input_height, input_width}); + + grad_input.zero_(); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + upsample_nearest2d_backward_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); } -Tensor upsample_nearest2d_cuda( +} // namespace + +Tensor& upsample_nearest2d_out_cuda( + Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest2d_forward( - input, output_size); + upsample_nearest2d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor upsample_nearest2d_cuda(const Tensor& input, IntArrayRef output_size) { + Tensor output = at::empty_like(input); + upsample_nearest2d_out_cuda_template(output, input, output_size); + return output; } Tensor& upsample_nearest2d_backward_out_cuda( @@ -25,17 +257,20 @@ Tensor& upsample_nearest2d_backward_out_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest2d_backward_out( - grad_input, grad_output, output_size, input_size); + upsample_nearest2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } Tensor upsample_nearest2d_backward_cuda( const Tensor& grad_output, IntArrayRef output_size, 
IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest2d_backward( - grad_output, output_size, input_size); + Tensor grad_input = at::empty_like(grad_output); + upsample_nearest2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index bb208a5986ba..39590bbfb40b 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -1,23 +1,280 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { -Tensor& upsample_nearest3d_out_cuda( +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest3d_out_frame( + const int n, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + const float depth_scale = (float)depth1 / (float)depth2; + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int d2 = index / (height2 * width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][d1][h1][w1]; + odata[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = + nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][d1][h1][w1]; + odata[n][c][d2][h2][w2] = val; + } + } + } +} + +// Backward operation +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest3d_backward_out_frame( + const int n, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + const float depth_scale = (float)depth1 / (float)depth2; + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int d2 = index / (height2 * width2); // 0:depth2-1 + + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == 
width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][d1][h1][w1]; + idata[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = + nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][d2][h2][w2]; + atomicAdd(&idata[n][c][d1][h1][w1], val); + } + } + } +} + +static void upsample_nearest3d_out_cuda_template( Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest3d_forward_out( - output, input, output_size); + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_nearest3d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_depth = input.size(2); + int input_height = input.size(3); + int input_width = input.size(4); + + upsample_3d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width); + + AT_ASSERT( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0); + + output.resize_({input.size(0), + input.size(1), + output_depth, + output_height, + output_width}); + output.zero_(); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_nearest3d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + upsample_nearest3d_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); } -Tensor upsample_nearest3d_cuda( +static void upsample_nearest3d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_nearest3d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_depth = input_size[2]; + int input_height = input_size[3]; + int input_width = input_size[4]; + + upsample_3d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + 
output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + grad_input.resize_( + {nbatch, channels, input_depth, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_nearest3d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + upsample_nearest3d_backward_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace + +Tensor& upsample_nearest3d_out_cuda( + Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest3d_forward( - input, output_size); + upsample_nearest3d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor upsample_nearest3d_cuda(const Tensor& input, IntArrayRef output_size) { + Tensor output = at::empty_like(input); + upsample_nearest3d_out_cuda_template(output, input, output_size); + return output; } Tensor& upsample_nearest3d_backward_out_cuda( @@ -25,17 +282,20 @@ Tensor& upsample_nearest3d_backward_out_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest3d_backward_out( - grad_input, grad_output, output_size, input_size); + upsample_nearest3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } Tensor upsample_nearest3d_backward_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest3d_backward( - grad_output, output_size, input_size); + Tensor grad_input = at::empty_like(grad_output); + upsample_nearest3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 386887fc3ed2..683860e8a466 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -1,25 +1,384 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_trilinear3d_out_frame( + const int n, + const accscalar_t rdepth, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int t2 = index / (height2 * 
width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][t1][h1][w1]; + odata[n][c][t2][h2][w2] = val; + } + } + return; + } + // + const accscalar_t t1r = area_pixel_compute_source_index( + rdepth, t2, align_corners, /*cubic=*/false); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const accscalar_t t1lambda = t1r - t1; + const accscalar_t t0lambda = static_cast(1) - t1lambda; + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const accscalar_t val = t0lambda * + (h0lambda * + (w0lambda * idata[n][c][t1][h1][w1] + + w1lambda * idata[n][c][t1][h1][w1 + w1p]) + + h1lambda * + (w0lambda * idata[n][c][t1][h1 + h1p][w1] + + w1lambda * idata[n][c][t1][h1 + h1p][w1 + w1p])) + + t1lambda * + (h0lambda * + (w0lambda * idata[n][c][t1 + t1p][h1][w1] + + w1lambda * idata[n][c][t1 + t1p][h1][w1 + w1p]) + + h1lambda * + (w0lambda * idata[n][c][t1 + t1p][h1 + h1p][w1] + + w1lambda * idata[n][c][t1 + t1p][h1 + h1p][w1 + w1p])); + odata[n][c][t2][h2][w2] = static_cast(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_trilinear3d_backward_out_frame( + const int n, + const accscalar_t rdepth, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int t2 = index / (height2 * width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][t1][h1][w1]; + idata[n][c][t2][h2][w2] = val; + } + } + return; + } + // + const accscalar_t t1r = area_pixel_compute_source_index( + rdepth, t2, align_corners, /*cubic=*/false); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const accscalar_t t1lambda = t1r - t1; + const accscalar_t t0lambda = static_cast(1) - t1lambda; + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 
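// Illustrative note (not part of the patch): the three lambda pairs computed
// in this kernel combine into eight corner weights,
//   {t0lambda, t1lambda} x {h0lambda, h1lambda} x {w0lambda, w1lambda},
// which always sum to 1. For example, t1lambda = 0.25, h1lambda = 0.5 and
// w1lambda = 0.1 give the (t1, h1, w1) corner a weight of
// 0.75 * 0.5 * 0.9 = 0.3375, and the atomicAdd calls a few lines below
// scatter exactly these eight products, times the output gradient d2val,
// into the surrounding input voxels.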
1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][t2][h2][w2]; + atomicAdd( + &idata[n][c][t1][h1][w1], + static_cast(t0lambda * h0lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1][h1][w1 + w1p], + static_cast(t0lambda * h0lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][t1][h1 + h1p][w1], + static_cast(t0lambda * h1lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1][h1 + h1p][w1 + w1p], + static_cast(t0lambda * h1lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1][w1], + static_cast(t1lambda * h0lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1][w1 + w1p], + static_cast(t1lambda * h0lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1 + h1p][w1], + static_cast(t1lambda * h1lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1 + h1p][w1 + w1p], + static_cast(t1lambda * h1lambda * w1lambda * d2val)); + } + } + } +} + +static void upsample_trilinear3d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_trilinear3d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_depth = input.size(2); + int input_height = input.size(3); + int input_width = input.size(4); + + upsample_3d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width); + + output.resize_({input.size(0), + input.size(1), + output_depth, + output_height, + output_width}); + output.zero_(); + + AT_ASSERT( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_trilinear3d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + const accscalar_t rdepth = area_pixel_compute_scale( + input_depth, output_depth, align_corners); + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_trilinear3d_out_frame + <<>>( + num_kernels, + rdepth, + rheight, + rwidth, + align_corners, + idata, + odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_trilinear3d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& 
grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_trilinear3d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_depth = input_size[2]; + int input_height = input_size[3]; + int input_width = input_size[4]; + + upsample_3d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width); + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_( + {nbatch, channels, input_depth, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), + "upsample_trilinear3d_backward_out_frame", + [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rdepth = area_pixel_compute_scale( + input_depth, output_depth, align_corners); + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_trilinear3d_backward_out_frame + <<>>( + num_kernels, + rdepth, + rheight, + rwidth, + align_corners, + idata, + odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_trilinear3d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_forward_out( - output, input, output_size, align_corners); + upsample_trilinear3d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_trilinear3d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_trilinear3d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_trilinear3d_backward_out_cuda( @@ -28,8 +387,9 @@ Tensor& upsample_trilinear3d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_trilinear3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_trilinear3d_backward_cuda( @@ -37,9 +397,11 @@ Tensor upsample_trilinear3d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_backward( - 
grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_trilinear3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index fcb737fd95b8..8392feadb228 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -415,10 +415,10 @@ std::tuple weight_norm_cuda_backward { // These checks should always succeed, because weight_norm_fused_backward should only // ever be recorded in the autograd graph via weight_norm, which passes contiguous v and g. - AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); - AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); - AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); - AT_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + TORCH_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") auto grad_v = at::empty_like(saved_v); auto grad_g = at::empty_like(saved_g); diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index 7cc49cea4727..938f037e501c 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -217,10 +217,10 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) { - AT_CHECK(args.size() <= expected_size, + TORCH_CHECK(args.size() <= expected_size, "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); - AT_CHECK(args.size() >= expected_size, + TORCH_CHECK(args.size() >= expected_size, "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index c634305d33d2..f315b098dcbb 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -46,14 +46,14 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens checkBackend(c, {*log_probs}, Backend::CUDA); checkBackend(c, {*targets}, Backend::CPU); int64_t batch_size = log_probs->size(1); - AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); - AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); + TORCH_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); + TORCH_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); setCuDNNStreamToCurrent(); - AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); + TORCH_CHECK(BLANK == 0, "blank must be label 0 for 
cudnn_ctc_loss"); // checked in dispatch: // assert other conditions for cudnnCTCLoss: all label lengths <= 256 // all input lengths = logprob.size(0) diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 39e0e1cd49be..3dbe44e9c075 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -627,7 +627,7 @@ Tensor _cudnn_rnn_flatten_weight( bool fn_bidirectional ) { - AT_CHECK(weight_arr.size() > 0, + TORCH_CHECK(weight_arr.size() > 0, "_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); auto any_param = weight_arr[0]; @@ -701,7 +701,7 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - AT_CHECK(!cx.defined(), + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } @@ -714,9 +714,9 @@ std::tuple _cudnn_rnn( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - AT_CHECK(hx.is_contiguous(), + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); - AT_CHECK(!cx.defined() || cx.is_contiguous(), + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); @@ -750,7 +750,7 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + TORCH_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes()); size_t workspace_size; @@ -842,7 +842,7 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - AT_CHECK(!cx.defined(), + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } @@ -857,9 +857,9 @@ std::tuple _cudnn_rnn_backward_input( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - AT_CHECK(hx.is_contiguous(), + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); - AT_CHECK(!cx.defined() || cx.is_contiguous(), + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); @@ -873,24 +873,24 @@ std::tuple _cudnn_rnn_backward_input( AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); auto dcx = cx.defined() ? 
at::empty(hidden_size, cx.options()) : Tensor(); - AT_CHECK(fn_train, + TORCH_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); - AT_CHECK(input.sizes().equals(input_size), + TORCH_CHECK(input.sizes().equals(input_size), "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - AT_CHECK(output.sizes().equals(output_size), + TORCH_CHECK(output.sizes().equals(output_size), "Expected output size ", IntArrayRef{output_size}, ", got ", output.sizes()); - AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); - AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + TORCH_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes()); - AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), + TORCH_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), "Expected d_hidden size ", IntArrayRef{hidden_size}, ", got ", dhy.sizes()); - AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), + TORCH_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), "Expected d_cell size ", IntArrayRef{hidden_size}, ", got ", dcy.sizes()); - AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); @@ -965,7 +965,7 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - AT_CHECK(!cx.defined(), + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } @@ -978,20 +978,20 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - AT_CHECK(fn_train, + TORCH_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); - AT_CHECK(input.sizes().equals(input_size), + TORCH_CHECK(input.sizes().equals(input_size), "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - AT_CHECK(hx.is_contiguous(), + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); - AT_CHECK(!cx.defined() || cx.is_contiguous(), + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); @@ -1236,7 +1236,7 @@ std::pair _cudnn_impl( AT_WARN(WEIGHT_FORMAT_WARN); } - AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); + TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); IntArrayRef batch_sizes { _batch_sizes.data(), static_cast(_batch_sizes.size(0)) }; auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp new file mode 100644 index 000000000000..1dce347c92ea --- /dev/null +++ b/aten/src/ATen/native/layer_norm.cpp @@ -0,0 +1,122 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +namespace { + +std::tuple 
layer_norm_forward_cpu( + const Tensor& X, + const Tensor& gamma /* optional */, + const Tensor& beta /* optional */, + int64_t M, + int64_t N, + double eps) { + Tensor Y = at::native::empty_like(X); + Tensor mean = at::empty({M}, X.options()); + Tensor rstd = at::empty({M}, X.options()); + LayerNormKernel(kCPU, X, gamma, beta, M, N, eps, &Y, &mean, &rstd); + return std::make_tuple(Y, mean, rstd); +} + +} // namespace + +Tensor layer_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */, + double eps, + bool cudnn_enabled) { + const int normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + const auto input_shape = input.sizes(); + const auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim) + .equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + AT_ERROR(ss.str()); + } + + const int axis = input_ndim - normalized_ndim; + const int64_t M = std::accumulate( + input_shape.cbegin(), + input_shape.cbegin() + axis, + 1LL, + std::multiplies()); + const int64_t N = std::accumulate( + input_shape.cbegin() + axis, + input_shape.cend(), + 1LL, + std::multiplies()); + + // TODO(yangxm): Remove this check after backward pass landed. 
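// Illustrative note (not part of the patch): a worked example of the M/N
// split computed above. Everything left of `axis` folds into M (the number
// of rows that get their own mean and rstd) and the normalized dimensions
// fold into N (the row length). With an input of shape [32, 128, 768]:
//   normalized_shape = [768]       -> axis = 2, M = 32 * 128 = 4096, N = 768
//   normalized_shape = [128, 768]  -> axis = 1, M = 32,             N = 98304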
+ const auto is_forward = [](const Tensor& tensor) { + return tensor.is_variable() && !tensor.requires_grad(); + }; + if (input.device().is_cpu() && is_forward(input) && is_forward(weight) && + is_forward(bias)) { + return std::get<0>(layer_norm_forward_cpu( + input.contiguous(), weight.contiguous(), bias.contiguous(), M, N, eps)); + } + + // Apply layer norm + auto input_reshaped = input.contiguous().view({1, M, -1}); + auto out = at::batch_norm( + input_reshaped, {}, {}, {}, {}, true, 0, eps, cudnn_enabled); + out = out.view(input_shape); + + if (weight.defined() && bias.defined()) { + return bias.addcmul(out, weight, 1); + } else if (weight.defined()) { + return out.mul(weight); + } else if (bias.defined()) { + return out.add(bias); + } else { + return out; + } +} + +DEFINE_DISPATCH(LayerNormKernel); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index cffbb1b1ac77..d7e9dab945db 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -211,10 +211,10 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) { - AT_CHECK(args.size() <= expected_size, + TORCH_CHECK(args.size() <= expected_size, "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); - AT_CHECK(args.size() >= expected_size, + TORCH_CHECK(args.size() >= expected_size, "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index a6ecdcde198c..126329cdfe31 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -52,30 +52,36 @@ static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANS template static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, const Tensor& mat2, Scalar beta_, Scalar alpha_) { - auto is_transposed = [&](const Tensor& t) { + auto is_transposed = [&](const TensorAccessor& t) { return t.stride(0) == 1 && t.stride(1) >= t.size(0); }; - const CBLAS_TRANSPOSE trans_A = is_transposed(mat1[0]) ? CblasTrans : CblasNoTrans; - const CBLAS_TRANSPOSE trans_B = is_transposed(mat2[0]) ? CblasTrans : CblasNoTrans; - const int batch_size = mat1.size(0); - const int M = mat1.size(1); - const int N = mat2.size(2); - const int K = mat1.size(2); + auto mat1_acc = mat1.accessor(); + auto mat2_acc = mat2.accessor(); + auto res_acc = res.accessor(); + + const CBLAS_TRANSPOSE trans_A = is_transposed(mat1_acc[0]) ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_B = is_transposed(mat2_acc[0]) ? CblasTrans : CblasNoTrans; + + const int batch_size = mat1_acc.size(0); + const int M = mat1_acc.size(1); + const int N = mat2_acc.size(2); + const int K = mat1_acc.size(2); scalar_t alpha = alpha_.to(); scalar_t beta = beta_.to(); - const int lda = is_transposed(mat1[0]) ? mat1[0].stride(1) : mat1[0].stride(0); - const int ldb = is_transposed(mat2[0]) ? mat2[0].stride(1) : mat2[0].stride(0); + const int lda = is_transposed(mat1_acc[0]) ? mat1_acc[0].stride(1) : mat1_acc[0].stride(0); + const int ldb = is_transposed(mat2_acc[0]) ? 
mat2_acc[0].stride(1) : mat2_acc[0].stride(0); const int ldc = res[0].stride(0); std::vector A(batch_size); std::vector B(batch_size); std::vector C(batch_size); + for (int64_t batch = 0; batch < batch_size; batch++) { - A[batch] = mat1[batch].data(); - B[batch] = mat2[batch].data(); - C[batch] = res[batch].data(); + A[batch] = mat1_acc[batch].data(); + B[batch] = mat2_acc[batch].data(); + C[batch] = res_acc[batch].data(); } gemm_batched(trans_A, trans_B, batch_size, M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index b2e135a524b9..58401d2d30ae 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -176,7 +176,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, osize = output_sizes[i]; istride = complex_input ? input.stride(i) >> 1 : input.stride(i); ostride = onumel; - AT_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, + TORCH_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); if (!need_contiguous && istride > MKL_LONG_MAX) { // If we didn't plan to contiguous-fy but the `istride` exceeds bound, @@ -186,7 +186,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, // fine as `inumel` is non-decreasing. need_contiguous = true; } - AT_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, + TORCH_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); inumel *= isize; onumel *= osize; diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index 0e1b351bdde1..6314773b5839 100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -73,7 +73,7 @@ static Tensor _mkldnn_pool2d( IntArrayRef dilation, bool ceil_mode, ideep::algorithm algo) { - AT_CHECK(!ceil_mode, "Currently Mkldnn Pooling operators do not support ceil_mode."); + TORCH_CHECK(!ceil_mode, "Currently Mkldnn Pooling operators do not support ceil_mode."); auto kernel_size_vec = expand_param_if_needed(kernel_size, "kernel_size", 2); auto stride_vec = expand_param_if_needed(stride, "stride", 2); auto padding_vec = expand_param_if_needed(padding, "padding", 2); diff --git a/aten/src/ATen/native/mkldnn/UnaryOps.cpp b/aten/src/ATen/native/mkldnn/UnaryOps.cpp new file mode 100644 index 000000000000..5045acd60a57 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/UnaryOps.cpp @@ -0,0 +1,46 @@ +#include +#include +#include + +#if !AT_MKLDNN_ENABLED() + +namespace at { +namespace native { + +Tensor mkldnn_sigmoid(const Tensor& self) { + AT_ERROR("mkldnn_sigmoid: ATen not compiled with MKLDNN support"); +} + +Tensor& mkldnn_sigmoid_(Tensor& self) { + AT_ERROR("mkldnn_sigmoid_: ATen not compiled with MKLDNN support"); +} + +} // namespace native +} // namespace at + +#else // AT_MKLDNN_EBABLED + +#include + +namespace at { +namespace native { + +Tensor mkldnn_sigmoid(const Tensor& self) { + ideep::tensor& x = itensor_from_mkldnn(self); + ideep::tensor y; + ideep::eltwise_forward::compute( + x, y, ideep::algorithm::eltwise_logistic, ideep::prop_kind::forward); + return new_with_itensor_mkldnn(std::move(y), self.options()); +} + +Tensor& mkldnn_sigmoid_(Tensor& self) { + ideep::tensor& x = itensor_from_mkldnn(self); + ideep::eltwise_forward::compute( + x, x, 
ideep::algorithm::eltwise_logistic, ideep::prop_kind::forward); + return self; +} + +} // namespace native +} // namespace at + +#endif // AT_MKLDNN_EBABLED diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2fec270eabbf..7c179ec58505 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -221,9 +221,6 @@ - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) variants: function, method - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: as_strided_tensorimpl CUDA: as_strided_tensorimpl @@ -424,7 +421,7 @@ - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor variants: function -- func: contiguous(Tensor self) -> Tensor +- func: contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor variants: method - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor @@ -456,28 +453,8 @@ variants: method device_guard: False -- func: s_copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - cpu_half: True - cpu_bool: True - cuda_bool: True - dispatch: - CPU: _s_copy__cpu - CUDA: _s_copy__cuda - QuantizedCPU: _s_copy__quantized - -- func: _s_copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - cpu_half: True - cpu_bool: True - cuda_bool: True - dispatch: - CUDA: _s_copy_from_cuda - -- func: _copy_same_type_(Tensor(a!) self, Tensor src) -> void - cpu_half: True - cpu_bool: True - cuda_bool: True - dispatch: - CPU: _copy_same_type__cpu +- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor + dispatch: {} - func: cos(Tensor self) -> Tensor variants: function, method @@ -696,9 +673,6 @@ CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -711,9 +685,6 @@ - func: resize_(Tensor(a!) self, int[] size) -> Tensor(a!) variants: method - cpu_bool: True - cuda_bool: True - cpu_half: True device_guard: False dispatch: CPU: resize_cpu_ @@ -729,9 +700,6 @@ device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda @@ -1411,6 +1379,9 @@ - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method +- func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor + variants: function + - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -1622,12 +1593,17 @@ - func: sigmoid(Tensor self) -> Tensor variants: function, method + dispatch: + CPU: sigmoid + CUDA: sigmoid + MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) variants: function, method dispatch: CPU: _sigmoid__cpu CUDA: _sigmoid__cuda + MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -1829,6 +1805,12 @@ - func: std(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method +- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + variants: function + +- func: std_mean(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + variants: function + - func: std(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. @@ -2005,6 +1987,12 @@ - func: var(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + variants: function + +- func: var_mean(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + variants: function + - func: view_as(Tensor self, Tensor other) -> Tensor variants: method device_guard: False @@ -2128,9 +2116,6 @@ - func: clone(Tensor self) -> Tensor variants: function, method - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: clone CUDA: clone @@ -2139,9 +2124,6 @@ MkldnnCPU: mkldnn_clone - func: resize_as_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) - cpu_bool: True - cuda_bool: True - cpu_half: True variants: function, method dispatch: CPU: resize_as_ @@ -2166,9 +2148,6 @@ - func: zero_(Tensor(a!) self) -> Tensor(a!) variants: method, function - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: zero_ CUDA: zero_ @@ -2547,7 +2526,7 @@ - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor -- func: quantize_linear(Tensor self, float scale, int zero_point) -> Tensor +- func: quantize_linear(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor variants: function, method dispatch: CPU: quantize_linear_cpu @@ -2557,6 +2536,11 @@ dispatch: QuantizedCPU: dequantize_quant +- func: dequantize_linear(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + variants: function, method + dispatch: + CPU: dequantize_linear_cpu + - func: q_scale(Tensor self) -> Scalar variants: function, method dispatch: @@ -2572,6 +2556,10 @@ dispatch: QuantizedCPU: int_repr_quant +- func: _per_tensor_affine_qtensor(Tensor self, float scale, int zero_point) -> Tensor + dispatch: + CPU: per_tensor_affine_qtensor_cpu + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. @@ -2603,12 +2591,7 @@ variants: method # NB: Does NOT check precondition that numel == 1 -# WARNING: Use of cpu_half here is generally not supported; please -# don't use it. - func: _local_scalar_dense(Tensor self) -> Scalar - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -2699,8 +2682,6 @@ - func: is_set_to(Tensor self, Tensor tensor) -> bool variants: method - cpu_bool: True - cuda_bool: True device_guard: False - func: masked_fill_(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) @@ -2722,9 +2703,6 @@ variants: function, method - func: view(Tensor(a) self, int[] size) -> Tensor(a) - cpu_half: True - cpu_bool: True - cuda_bool: True variants: method device_guard: False @@ -2951,6 +2929,9 @@ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) 
variants: method + dispatch: + CPU: uniform_cpu_ + CUDA: uniform_cuda_ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -3657,15 +3638,27 @@ - func: adaptive_avg_pool3d(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_out_cpu + CUDA: adaptive_avg_pool3d_out_cuda - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_cpu + CUDA: adaptive_avg_pool3d_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_out_cpu + CUDA: adaptive_avg_pool3d_backward_out_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_cpu + CUDA: adaptive_avg_pool3d_backward_cuda # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) @@ -3804,18 +3797,30 @@ CUDA: fractional_max_pool3d_backward_cuda # Return: (Tensor output, Tensor indices) -- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) +- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) python_module: nn + dispatch: + CPU: max_pool2d_with_indices_out_cpu + CUDA: max_pool2d_with_indices_out_cuda # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) python_module: nn + dispatch: + CPU: max_pool2d_with_indices_cpu + CUDA: max_pool2d_with_indices_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + dispatch: + CPU: max_pool2d_with_indices_backward_out_cpu + CUDA: max_pool2d_with_indices_backward_out_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor python_module: nn + dispatch: + CPU: max_pool2d_with_indices_backward_cpu + CUDA: max_pool2d_with_indices_backward_cuda # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) output, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp index 3e66085cdf54..3951d86bc243 100644 --- a/aten/src/ATen/native/quantized/Copy.cpp +++ b/aten/src/ATen/native/quantized/Copy.cpp @@ -1,27 +1,28 @@ -#include +#include #include -#include -#include #include namespace at { namespace native { -Tensor& _s_copy__quantized(Tensor& self, const Tensor& src, bool /* unused */) { - AT_CHECK( - self.scalar_type() == at::kQInt8, - "Quantized copy only works with kQInt8 as target Tensor"); - AT_CHECK( +Tensor& quantized_copy_(Tensor& self, const Tensor& src) { + TORCH_CHECK( src.scalar_type() == at::kFloat, "Quantized copy only works with kFloat as source Tensor"); - qint8* self_data = self.data(); - float* src_data = src.data(); - for (int i = 0; i < self.numel(); ++i) { - self_data[i] = quantize_uint8( - self.q_scale().to(), - self.q_zero_point().to(), - src_data[i]); - } + TORCH_CHECK(self.is_contiguous() && src.is_contiguous(), + "Quantized copy only works with contiguous Tensors"); + TORCH_CHECK(self.sizes().equals(src.sizes()), + "Quantized copy only works with Tensors with the same shape"); + AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() { + float* src_data = src.data(); + scalar_t* self_data = self.data(); + for (int i = 0; i < self.numel(); ++i) { + self_data[i] = quantize_val( + self.q_scale().to(), + self.q_zero_point().to(), + src_data[i]); + } + }); return self; } } // namespace native diff --git a/aten/src/ATen/native/quantized/Copy.h b/aten/src/ATen/native/quantized/Copy.h new file mode 100644 index 000000000000..0e55387a8545 --- /dev/null +++ b/aten/src/ATen/native/quantized/Copy.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace at { +namespace native { + +Tensor& quantized_copy_(Tensor& self, const Tensor& src); + +} +} diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 0166632daa33..20ceb5d32cea 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -7,8 +7,8 @@ namespace at { namespace native { -Tensor quantize_linear_cpu(const Tensor& self, double scale, int64_t zero_point) { - auto quantizer = make_per_tensor_affine_quantizer(scale, zero_point); +Tensor quantize_linear_cpu(const Tensor& self, double scale, int64_t zero_point, ScalarType dtype) { + auto quantizer = make_per_tensor_affine_quantizer(scale, zero_point, dtype); return quantizer->quantize(self); } @@ -16,6 +16,21 @@ Tensor dequantize_quant(const Tensor& self) { return get_qtensorimpl(self)->quantizer()->dequantize(self); } +Tensor dequantize_linear_cpu(const Tensor& self, double scale, int64_t zero_point, ScalarType dtype) { + AT_CHECK(isQIntType(toQIntType(self.scalar_type())), + "Scalar type for quantized Tensor must have same underlying type as input."); + AT_CHECK(dtype == ScalarType::Float, "ScalarType for target Tensor must be float."); + Tensor f = at::empty(self.sizes(), self.options().dtype(dtype)); + AT_DISPATCH_QINT_TYPES( + toQIntType(self.scalar_type()), "dequantize_linear_cpu", [&]() { + underlying_t* qdata = self.data(); + auto* fdata = f.data(); + for (int i = 0; i < self.numel(); ++i) { + fdata[i] = (static_cast(qdata[i]) - zero_point) * scale; + }}); + return f; +} + Scalar q_scale_quant(const Tensor& self) { auto quantizer = get_qtensorimpl(self)->quantizer(); AT_ASSERT(quantizer->qscheme() == kPerTensorAffine); @@ -33,12 +48,27 @@ Quantizer* quantizer(const Tensor& self) { } Tensor 
int_repr_quant(const Tensor& self) { - Tensor dst = at::empty(self.sizes(), self.options().dtype(at::kByte)); - uint8_t* self_data = reinterpret_cast(self.data()); - uint8_t* dst_data = dst.data(); - if (self.numel() > 0) { - memcpy(dst_data, self_data, self.numel()); - } + Tensor dst; + AT_DISPATCH_QINT_TYPES( + self.scalar_type(), "int_repr", [&]() { + dst = at::empty(self.sizes(), self.options().dtype(UNDERLYING_TYPE)); + underlying_t* self_data = reinterpret_cast(self.data()); + underlying_t* dst_data = dst.data(); + if (self.numel() > 0) { + memcpy(dst_data, self_data, self.nbytes()); + }}); + return dst; +} + +Tensor per_tensor_affine_qtensor_cpu(const Tensor& self, double scale, int64_t zero_point) { + Tensor dst = at::_empty_affine_quantized(self.sizes(), self.options().dtype(toQIntType(self.scalar_type())), scale, zero_point); + AT_DISPATCH_QINT_TYPES(dst.scalar_type(), "per_tensor_affine_qtensor", [&]() { + underlying_t* self_data = self.data(); + underlying_t* dst_data = reinterpret_cast(dst.data()); + if (self.numel() > 0) { + memcpy(dst_data, self_data, self.numel()); + } + }); return dst; } diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index 100d19c5036f..d3268a935a00 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -11,7 +11,8 @@ namespace native { // support quantizer in python frontend, once // that is ready, we'll change to use quantizer Tensor empty_affine_quantized_cpu(IntArrayRef size, const TensorOptions& options, double scale, int64_t zero_point) { - return new_qtensor_cpu(size, options, make_per_tensor_affine_quantizer(scale, zero_point)); + TORCH_CHECK(options.has_dtype(), "Must provide data type for Tensor creation functions."); + return new_qtensor_cpu(size, options, make_per_tensor_affine_quantizer(scale, zero_point, typeMetaToScalarType(options.dtype()))); } }} // at::native diff --git a/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp b/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp index 3efb36e2edd5..b7a8d1fe9992 100644 --- a/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp +++ b/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp @@ -136,11 +136,13 @@ class FakeQuantizePerTensorAffineOp_backward : public c10::OperatorKernel { static auto registry = c10::RegisterOperators() .op("quantized::fake_quantize_per_tensor_affine_forward(Tensor X, float scale, int zero_point, int num_bits = 8, int quant_delay = 0, int iter = 0) -> Tensor", - c10::kernel(), - c10::dispatchKey(CPUTensorId())) + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())) .op("quantized::fake_quantize_per_tensor_affine_backward(Tensor X, Tensor dY, float scale, int zero_point, int num_bits=8, int quant_delay=0, int iter = 0) -> Tensor", - c10::kernel(), - c10::dispatchKey(CPUTensorId())); + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())); } // namespace }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index 05692969253d..498ce8aca4c8 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -20,6 +20,14 @@ struct FBGEMM_API PackedFCWeight { int w_zp; }; +struct FBGEMM_API PackedConvWeight { + std::unique_ptr> w; + std::vector col_offsets; + std::vector kernel; + float w_scale; + int32_t w_zp; +}; + // Convert the weight from uint8 
to int8. static void convert_uint8_int8( int K, diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp new file mode 100644 index 000000000000..a8361835ba92 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { +namespace { +template +class QAddInt8 final : public c10::OperatorKernel { + public: + Tensor operator()(at::Tensor qa, at::Tensor qb, + double scale, int64_t zero_point) { + AT_ASSERTM(qa.numel() == qb.numel(), "Add operands must be the same size!"); + TORCH_CHECK(qa.scalar_type() == qb.scalar_type(), "Add operands should have same data type."); + auto a = qa.dequantize(); + auto b = qb.dequantize(); + auto c = at::empty_like(a); + auto iter = TensorIterator::binary_op(c, a, b); + + if (ReLUFused) { + binary_kernel(*iter, [&](float a_val, float b_val) -> float { + return std::max(a_val + b_val, 0); + }); + } else { + binary_kernel(*iter, [&](float a_val, float b_val) -> float { + return a_val + b_val; + }); + } + return c.quantize_linear(scale, zero_point, qa.scalar_type()); // Requantize + } +}; + +static auto registry = c10::RegisterOperators() +.op("quantized::add(Tensor qa, Tensor qb, float scale, int zero_point)" + "-> Tensor qc", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())) +.op("quantized::add_relu(Tensor qa, Tensor qb, float scale, int zero_point)" + "-> Tensor qc", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())); +} // namespace +}} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp new file mode 100644 index 000000000000..7675a1cd02e6 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -0,0 +1,156 @@ +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { +class QConv2dInt8 final : public c10::OperatorKernel { + public: +#ifdef USE_FBGEMM + Tensor operator()( + Tensor act, + Tensor packed_weight, + Tensor bias, + const std::vector& stride, + const std::vector& padding, + const std::vector& dilation, + const std::vector& output_padding, + int64_t groups, + double output_scale, + int64_t output_zero_point) { + TORCH_CHECK( + fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + TORCH_CHECK( + act.ndimension() == 4, + "Activations are supposed to have 4 dimensions."); + TORCH_CHECK(stride.size() == 2, "2D convolution only"); + TORCH_CHECK(padding.size() == 2, "2D convolution only"); + TORCH_CHECK(dilation.size() == 2, "2D convolution only"); + TORCH_CHECK(output_padding.size() == 2, "2D convolution only"); + TORCH_CHECK( + (dilation[0] == 1 && dilation[1] == 1), + "Currently dilation should be 1"); + TORCH_CHECK( + (output_padding[0] == 0 && output_padding[1] == 0), + "Currently output padding should be 0"); + + // inputs are in NHWC format + int N = act.size(0); + int H = act.size(1); + int W = act.size(2); + int C = act.size(3); + int K = bias.size(0); + + Tensor act_contig = act.contiguous(); + const uint8_t* act_ptr = + reinterpret_cast(act_contig.data()); + + PackedConvWeight& pack_ptr = + cpp_custom_type_hack::cast(packed_weight); + auto packB = pack_ptr.w.get(); + // packB->printPackedMatrix("PackedB inside QConv2dInt8:"); + auto& col_offsets = pack_ptr.col_offsets; + auto& kernel = pack_ptr.kernel; + + std::vector row_offset_buf( + 
fbgemm::PackAWithIm2Col::rowOffsetBufferSize()); + + int pad_l = padding[0]; + int pad_t = padding[1]; + int stride_h = stride[0]; + int stride_w = stride[1]; + int kernel_h = kernel[0]; + int kernel_w = kernel[1]; + + fbgemm::conv_param_t<> conv_p( + N, // Batch size + C, // Number of input channels + K, // Number of output channels + {H, W}, + groups, + {kernel_h, kernel_w}, + {stride_h, stride_w}, + {pad_l, pad_t, pad_l, pad_t}); + + fbgemm::PackAWithIm2Col packA( + conv_p, + act_ptr, + nullptr, + act.q_zero_point().toInt(), + row_offset_buf.data()); + + fbgemm::DoNothing<> NoOpObj{}; + + auto bias_contig = bias.contiguous(); + + float act_scale = act.q_scale().toFloat(); + int32_t act_zero_point = act.q_zero_point().toInt(); + + float weight_scale_float = pack_ptr.w_scale; + int32_t weight_zero_point_int32 = pack_ptr.w_zp; + + float output_multiplier_float = + (act_scale * weight_scale_float) / static_cast(output_scale); + + fbgemm::ReQuantizeOutput outputProcObj( + NoOpObj, + &output_multiplier_float, + output_zero_point, + act_zero_point, + &weight_zero_point_int32, + packA.getRowOffsetBuffer(), + col_offsets.data(), + bias_contig.data(), + K, + groups); + + Tensor output = _empty_affine_quantized( + {N, H, W, K}, + device(kCPU).dtype(kQUInt8), + output_scale, + output_zero_point); + auto buffer = at::zeros_like(output, output.options().dtype(at::kInt)); + + // Do the GEMM + fbgemm::fbgemmPacked( + packA, + *packB, + reinterpret_cast(output.data()), + buffer.data(), + K, + outputProcObj, + 0 /* thread_id*/, + 1 /* num_threads */); + + return output; + } +#else // USE_FBGEMM + Tensor operator()( + Tensor /* activation */, + Tensor /* packed_weight */, + Tensor /* bias */, + const std::vector& /* stride */, + const std::vector& /* padding */, + const std::vector& /* dilation */, + const std::vector& /* output padding */, + int64_t /* groups */, + double /* output scale */, + int64_t /* output_zero_point */) { + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } +#endif // USE_FBGEMM +}; + +static auto registry = c10::RegisterOperators().op( + "quantized::fbgemm_conv2d", + c10::RegisterOperators::options().kernel().dispatchKey( + QuantizedCPUTensorId())); + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp new file mode 100644 index 000000000000..f009a6aad81b --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include + +namespace caffe2 { +#ifdef USE_FBGEMM +// Required for cpp_custom_type_hack to work +CAFFE_KNOWN_TYPE(PackedConvWeight); +#endif +} // namespace caffe2 + +namespace at { +namespace native { +namespace { +class QConvPackWeightInt8 final : public c10::OperatorKernel { + public: +#ifdef USE_FBGEMM + Tensor operator()(Tensor weight, int64_t groups) { + TORCH_CHECK( + weight.ndimension() == 4, "Weights are expected to have 4 dimensions"); + TORCH_CHECK(groups == 1, "Groupwise convolutions are not supported yet"); + // weights in RS(C/G)K format + // matrix dimensions after im2col + int NDim = weight.size(3) / groups; + int KDim = weight.size(0) * weight.size(1) * groups * weight.size(2); + auto weight_config = weight.contiguous(); + int weight_zero_point_int32 = weight.q_zero_point().toInt(); + TORCH_CHECK( + weight_zero_point_int32 == 0, + "Only symmetric quantization is supported for weights yet"); + const int8_t* 
weight_ptr_int8 = + reinterpret_cast(weight_config.data()); + + std::vector col_offsets(NDim * groups); + std::vector kernel{static_cast(weight.size(0)), + static_cast(weight.size(1))}; + std::vector weight_int8(KDim * NDim * groups); + auto ret_ptr = guts::make_unique( + PackedConvWeight{guts::make_unique>( + fbgemm::matrix_op_t::NoTranspose, + KDim, + NDim, + weight_ptr_int8, + NDim, + nullptr, // PackBMatrix manages ownership of pmat + groups), + col_offsets, + kernel, + weight.q_scale().toFloat(), + weight_zero_point_int32}); + // TODO: we will need to replace this with torchscript classes at a later + // point. + return cpp_custom_type_hack::create(std::move(ret_ptr), weight.options()); + } +#else // USE_FBGEMM + Tensor operator()( + Tensor, /* weight */ + int64_t /* groups */ + ) { + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } +#endif // USE_FBGEMM +}; + +static auto registry = c10::RegisterOperators().op( + "quantized::fbgemm_conv_prepack", + c10::RegisterOperators::options().kernel().dispatchKey( + QuantizedCPUTensorId())); + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qfc.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp similarity index 91% rename from aten/src/ATen/native/quantized/cpu/qfc.cpp rename to aten/src/ATen/native/quantized/cpu/qlinear.cpp index 62c02d63cd76..f276010e9412 100644 --- a/aten/src/ATen/native/quantized/cpu/qfc.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -32,7 +32,7 @@ class QFCInt8 final : public c10::OperatorKernel { // TODO: contiguous is called for further jit optimizations. auto input_contig = input.contiguous(); const auto* input_ptr = - reinterpret_cast(input_contig.data()); + reinterpret_cast(input_contig.data()); AT_ASSERT(input.dim() >= 2); // C(output) = A(input) x B(weight), where C, A, B are M x N, M x K, K x N @@ -95,6 +95,8 @@ class QFCInt8 final : public c10::OperatorKernel { // TODO: contiguous is called for further jit optimizations. 
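The quantized copy and dequantize_linear_cpu kernels earlier in this patch both apply the per-tensor affine mapping q = clamp(round(x / scale) + zero_point, qmin, qmax) and its inverse x ~= (q - zero_point) * scale. A minimal scalar sketch of that mapping for quint8 (illustrative values; plain C++ rather than ATen's quantize_val, whose rounding details may differ):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Per-tensor affine quantization of a float to uint8 and back.
    uint8_t quantize_affine(float x, float scale, int32_t zero_point) {
      const int32_t qmin = 0, qmax = 255;  // quint8 range
      const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::min(qmax, std::max(qmin, q)));
    }

    float dequantize_affine(uint8_t q, float scale, int32_t zero_point) {
      return (static_cast<int32_t>(q) - zero_point) * scale;
    }

    int main() {
      const float scale = 0.1f;
      const int32_t zero_point = 128;
      const uint8_t q = quantize_affine(1.23f, scale, zero_point);  // round(12.3) + 128 = 140
      std::printf("q=%u, x'=%f\n", static_cast<unsigned>(q),
                  dequantize_affine(q, scale, zero_point));          // x' = 1.2
      return 0;
    }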
auto bias_contig = bias.contiguous(); + const auto* bias_ptr = + reinterpret_cast(bias_contig.data()); // After the uint8 * int8 matrix multiplication is performed, this operation // does: @@ -108,13 +110,13 @@ class QFCInt8 final : public c10::OperatorKernel { /*Bq_zero_point=*/&weight_zero_point_int32, /*row_offsets=*/packA.getRowOffsetBuffer(), /*col_offsets=*/col_offsets.data(), - /*bias=*/bias_contig.data(), + /*bias=*/bias_ptr, /*nCol=*/N); // Allocate output Tensor and a buffer for fbgemmPacked to use auto output = _empty_affine_quantized( {M, N}, - at::device(kCPU).dtype(kQInt8), + at::device(kCPU).dtype(kQUInt8), output_scale, output_zero_point); @@ -124,7 +126,7 @@ class QFCInt8 final : public c10::OperatorKernel { fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/reinterpret_cast(output.data()), + /*C=*/reinterpret_cast(output.data()), /*C_buffer=*/buffer.data(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -152,11 +154,13 @@ class QFCInt8 final : public c10::OperatorKernel { static auto registry = c10::RegisterOperators() .op("quantized::fbgemm_linear(Tensor X, Tensor W_prepack, Tensor b, float Y_scale_i, int Y_zero_point_i) -> Tensor Y", - c10::kernel>(), - c10::dispatchKey(QuantizedCPUTensorId())) + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())) .op("quantized::fbgemm_linear_relu(Tensor X, Tensor W_prepack, Tensor b, float Y_scale_i, int Y_zero_point_i) -> Tensor Y", - c10::kernel>(), - c10::dispatchKey(QuantizedCPUTensorId())); + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())); } // namespace } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qfc_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp similarity index 89% rename from aten/src/ATen/native/quantized/cpu/qfc_prepack.cpp rename to aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 0d1e18f6733c..ead10f05f611 100644 --- a/aten/src/ATen/native/quantized/cpu/qfc_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -45,16 +45,13 @@ class QFCPackWeightInt8 final : public c10::OperatorKernel { auto N = weight.size(0); auto K = weight.size(1); - int32_t weight_zero_point_int32 = weight.q_zero_point().toInt() - 128; + int32_t weight_zero_point_int32 = weight.q_zero_point().toInt(); // TODO: contiguous is called for further JIT optimizations. 
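Both the quantized conv2d and linear kernels above hand fbgemm::ReQuantizeOutput the multiplier (act_scale * w_scale) / out_scale to fold the int32 GEMM accumulator back into quint8. A simplified scalar sketch of that requantization step, assuming the row/column zero-point corrections and the bias term that fbgemm also folds in are ignored:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Simplified requantization of an int32 accumulator to quint8.
    // fbgemm's ReQuantizeOutput additionally subtracts row/column offset terms
    // and adds the bias; this sketch keeps only the scaling step.
    uint8_t requantize(int32_t acc, float act_scale, float weight_scale,
                       float out_scale, int32_t out_zero_point) {
      const float multiplier = (act_scale * weight_scale) / out_scale;
      const int q = static_cast<int>(std::nearbyint(acc * multiplier)) + out_zero_point;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }

    int main() {
      // 1234 * (0.05 * 0.02) / 0.1 = 12.34 -> 12
      return requantize(1234, 0.05f, 0.02f, 0.1f, 0) == 12 ? 0 : 1;
    }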
auto weight_contig = weight.contiguous(); - std::vector weight_int8(K * N); - int8_t* weight_ptr_int8 = weight_int8.data(); - uint8_t* weight_ptr_uint8 = - reinterpret_cast(weight_contig.data()); - convert_uint8_int8(K, N, weight_ptr_uint8, weight_ptr_int8); + int8_t* weight_ptr_int8 = + reinterpret_cast(weight_contig.data()); std::vector col_offsets(N); calc_col_offsets_transpose( @@ -95,8 +92,9 @@ class QFCPackWeightInt8 final : public c10::OperatorKernel { static auto registry = c10::RegisterOperators().op( "quantized::fbgemm_linear_prepack(Tensor W) -> Tensor W_prepack", - c10::kernel(), - c10::dispatchKey(QuantizedCPUTensorId())); + c10::RegisterOperators::options() + .kernel() + .dispatchKey(QuantizedCPUTensorId())); } // namespace } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qrelu.cpp b/aten/src/ATen/native/quantized/cpu/qrelu.cpp index e179ff5b6fbe..ee540901087a 100644 --- a/aten/src/ATen/native/quantized/cpu/qrelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qrelu.cpp @@ -14,13 +14,13 @@ class QReluInt8 final : public c10::OperatorKernel { public: Tensor operator()(Tensor qx) { Tensor qy = at::_empty_affine_quantized(qx.sizes(), - at::device(kCPU).dtype(kQInt8), + at::device(kCPU).dtype(kQUInt8), qx.q_scale().toDouble(), qx.q_zero_point().toLong()); auto iter = TensorIterator::unary_op(qy, qx); const auto zero_point = qx.q_zero_point().toByte(); - unary_kernel(*iter, [&](c10::qint8 value) -> c10::qint8 { - return c10::qint8(std::max(value.val_, zero_point)); + unary_kernel(*iter, [&](c10::quint8 value) -> c10::quint8 { + return c10::quint8(std::max(value.val_, zero_point)); }); return qy; } @@ -28,8 +28,9 @@ class QReluInt8 final : public c10::OperatorKernel { static auto registry = c10::RegisterOperators().op( "quantized::relu(Tensor qx) -> Tensor", - c10::kernel(), - c10::dispatchKey(QuantizedCPUTensorId())); + c10::RegisterOperators::options() + .kernel() + .dispatchKey(QuantizedCPUTensorId())); } // namespace }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qsumrelu.cpp b/aten/src/ATen/native/quantized/cpu/qsumrelu.cpp deleted file mode 100644 index 0fdf368d5b58..000000000000 --- a/aten/src/ATen/native/quantized/cpu/qsumrelu.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -namespace at { namespace native { -namespace { -class QSumReLUInt8 final : public c10::OperatorKernel { - public: - Tensor operator()(at::Tensor qa, at::Tensor qb, - double scale, int64_t zero_point) { - AT_ASSERTM(qa.numel() == qb.numel(), "Sum operands must be the same size!"); - auto a = qa.dequantize(); - auto b = qb.dequantize(); - auto c = at::empty_like(a); - auto iter = TensorIterator::binary_op(c, a, b); - binary_kernel(*iter, [&](float a_val, float b_val) -> float { - return std::max(a_val + b_val, 0); - }); - return c.quantize_linear(scale, zero_point); // Requantize - } -}; - -static auto registry = c10::RegisterOperators().op( - "quantized::sum_relu(Tensor qa, Tensor qb, float scale, int zero_point)" - "-> Tensor qc", - c10::kernel(), - c10::dispatchKey(QuantizedCPUTensorId())); -} // namespace -}} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_per_tensor_affine.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_per_tensor_affine.cu new file mode 100644 index 000000000000..c50260c972c6 --- /dev/null +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_per_tensor_affine.cu @@ -0,0 +1,165 @@ +#include +#include +#include 
+#include + +/* FakeQuantize Op for PerTensorAffine quantization scheme */ +namespace at { namespace native { +namespace { +/* Fake-quantizes the 'inputs' tensor. +Args: + X: Forward input tensor. + scale: scale of per tensor affine quantization + zero_point: zero_point of per tensor affine quantization + num_bits: Number of quantization bits. + quant_delay: Count of global steps for which to delay the quantization. + See note below. + iter: The current quantization iteration used for `quant_delay`. +Returns: + Quantized tensor (double dtype). + +Notes: + - quant_delay might be set to non-zero to help weights stabilize in the + beginning of the training. + - quantization range [0, 2^bits - 1] +*/ +class FakeQuantizePerTensorAffineOp_forward : public c10::OperatorKernel { + public: + at::Tensor operator()( + at::Tensor X, + double scale, + int64_t zero_point, + int64_t num_bits = 8, + int64_t quant_delay = 0, + int64_t iter = 0 + ) { + // Sanity checks. + TORCH_CHECK(X.is_cuda()); + TORCH_CHECK(X.scalar_type() == ScalarType::Float); + if (num_bits > 32 || num_bits < 1) { + throw std::invalid_argument("`num_bits` should be in the [1, 32] range."); + } + if (zero_point < 0) { + throw std::invalid_argument("`zero_point` must be a positive integer."); + } + if (quant_delay < 0) { + throw std::invalid_argument("`quant_delay` must be a positive integer."); + } + + if (quant_delay != 0 && iter < 0) { + throw std::invalid_argument( + "`iter` must be >=0 for non-zero `quant_delay`"); + } + + auto Y = at::empty_like(X); + + if (quant_delay > 0 && iter <= quant_delay) { + Y.copy_(X); // We might want to just return the input here. + return Y; + } + + float inv_scale = 1.0f / scale; + const float quant_min = 0; + const float quant_max = (1 << num_bits) - 1; + at::cuda::CUDA_tensor_apply2( + X, + Y, + [=] __device__ ( + const float& input_val, + float& result_val) { + result_val = (fminf(quant_max, fmaxf(quant_min, (std::round(input_val * inv_scale + zero_point)))) - zero_point) * scale; + }); + return Y; + } +}; + +/* Backward path to fake-quantize the 'inputs' tensor. + +Args: + X: Forward input tensor. + dY: Backward input tensor. + scale: scale of per tensor affine quantization + zero_point: zero_point of per tensor affine quantization + num_bits: Number of quantization bits. + quant_delay: Count of global steps for which to delay the quantization. + See note in forward. + iter: The current quantization iteration used for `quant_delay`. +Returns: + Quantized tensor (double dtype). + +Notes: + - quant_delay might be set to non-zero to help weights stabilize in the + beginning of the training. + - quantization range [0, 2^bits - 1] +*/ +class FakeQuantizePerTensorAffineOp_backward : public c10::OperatorKernel { + public: + at::Tensor operator()( + at::Tensor X, + at::Tensor dY, + double scale, + int64_t zero_point, + int64_t num_bits = 8, + int64_t quant_delay = 0, + int64_t iter = 0) { + // Sanity checks. 
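The CUDA lambda in the forward op above performs straight-through fake quantization: the value is snapped to the [0, 2^num_bits - 1] integer grid and immediately dequantized, while the backward op defined next masks gradients to that representable range. A scalar sketch of the forward arithmetic (illustrative only; not the CUDA_tensor_apply2 path):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar version of the fake-quantize forward arithmetic: quantize to the
    // integer grid, clamp to [0, 2^num_bits - 1], then dequantize.
    float fake_quantize(float x, float scale, int64_t zero_point, int64_t num_bits = 8) {
      const float inv_scale = 1.0f / scale;
      const float quant_min = 0.0f;
      const float quant_max = static_cast<float>((1LL << num_bits) - 1);
      const float q = std::fmin(quant_max,
                                std::fmax(quant_min, std::round(x * inv_scale + zero_point)));
      return (q - zero_point) * scale;
    }

    int main() {
      volatile float y = fake_quantize(0.123f, /*scale=*/0.1f, /*zero_point=*/0);  // snaps to 0.1
      (void)y;
      return 0;
    }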
+ TORCH_CHECK(X.is_cuda()); + TORCH_CHECK(X.scalar_type() == ScalarType::Float); + if (num_bits > 32 || num_bits < 1) { + throw std::invalid_argument("`num_bits` should be in the [1, 32] range."); + } + if (zero_point < 0) { + throw std::invalid_argument("`zero_point` must be a positive integer."); + } + if (quant_delay < 0) { + throw std::invalid_argument("`quant_delay` must be a positive integer."); + } + if (X.numel() <= 0) { + return X; + } + if (X.numel() != dY.numel()) { + throw std::invalid_argument("`X` and `dY` are not the same size"); + } + + if (quant_delay != 0 && iter < 0) { + throw std::invalid_argument( + "`iter` must be >=0 for non-zero `quant_delay`"); + } + + auto dX = at::zeros_like(dY); + if (quant_delay > 0 && iter <= quant_delay) { + dX.copy_(dY); + return dX; + } + + float inv_scale = 1.0f / scale; + const float quant_min = 0; + const float quant_max = (1 << num_bits) - 1; + auto mask = at::empty_like(dY); + at::cuda::CUDA_tensor_apply2( + X, + mask, + [=] __device__ ( + const float& input_val, + float& result_val) { + float Xq = std::round(input_val * inv_scale + zero_point); + result_val = float(Xq >= quant_min && Xq <= quant_max); + }); + dX = mask * dY; + return dX; + } +}; + +static auto registry = + c10::RegisterOperators() + .op("quantized::fake_quantize_per_tensor_affine_forward(Tensor X, float scale, int zero_point, int num_bits = 8, int quant_delay = 0, int iter = 0) -> Tensor", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CUDATensorId())) + .op("quantized::fake_quantize_per_tensor_affine_backward(Tensor X, Tensor dY, float scale, int zero_point, int num_bits=8, int quant_delay=0, int iter = 0) -> Tensor", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CUDATensorId())); + +} // namespace +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index c4ec81d0c2ef..62860f80c6ff 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -52,13 +52,13 @@ Tensor &_coalesced_sparse_(SparseTensor& self, bool coalesced) { } Tensor indices_sparse(const Tensor& self) { - AT_CHECK(self.is_coalesced(), + TORCH_CHECK(self.is_coalesced(), "Cannot get indices on an uncoalesced tensor, please call .coalesce() first"); return get_sparse_impl(self)->indices().alias(); } Tensor values_sparse(const Tensor& self) { - AT_CHECK(self.is_coalesced(), + TORCH_CHECK(self.is_coalesced(), "Cannot get values on an uncoalesced tensor, please call .coalesce() first"); return get_sparse_impl(self)->values().alias(); } @@ -91,7 +91,6 @@ SparseTensor new_with_dims_sparse(int64_t sparse_dim, int64_t dense_dim, ArrayRe return self; } -// Does NOT make copies of indices and values SparseTensor new_with_dims_and_tensor_sparse( int64_t sparse_dim, int64_t dense_dim, @@ -101,7 +100,16 @@ SparseTensor new_with_dims_and_tensor_sparse( const TensorOptions& options) { SparseTensor self = new_sparse(options); get_sparse_impl(self)->resize_(sparse_dim, dense_dim, size); - alias_into_sparse(self, indices, values); + // NOTE: There is no guarantee that `indices` and `values` don't contain AutogradMeta. However, + // we want to maintain the invariant that `indices_` and `values_` of a sparse tensor don't + // contain AutogradMeta, and to achieve that we shallow-copy `indices` and `values` here. 
+ auto indices_shallow_copy = LongTensor(indices.unsafeGetTensorImpl()->shallow_copy_and_detach( + /*version_counter=*/indices.unsafeGetTensorImpl()->version_counter(), + /*allow_tensor_metadata_change=*/true)); + auto values_shallow_copy = Tensor(values.unsafeGetTensorImpl()->shallow_copy_and_detach( + /*version_counter=*/values.unsafeGetTensorImpl()->version_counter(), + /*allow_tensor_metadata_change=*/true)); + alias_into_sparse(self, indices_shallow_copy, values_shallow_copy); return self; } @@ -109,7 +117,7 @@ SparseTensor new_with_dims_and_tensor_sparse( /** Empty init **/ Tensor empty_sparse(IntArrayRef size, const TensorOptions& options) { - AT_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); + TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); return new_with_dims_sparse(size.size(), 0, size, options); } @@ -137,11 +145,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, const Ten Tensor values = expand_values_if_needed(values_); // arg checking - AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + TORCH_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); // the following checks are redundant because they are also checked in SparseTensorImpl::set_indices_and_values_unsafe // but we need to ensure them in order to infer the shape. - AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) - AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) + TORCH_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); // If sizes are not given, it is inferred as max index of each dim. int64_t sparse_dim = indices.size(0); @@ -161,7 +169,7 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, const Ten auto cpu_computed_indices_sizes_accessor = cpu_computed_indices_sizes.accessor(); for (int64_t d = 0; d < sparse_dim; d++) { int64_t min_index_in_dim = cpu_min_indices_accessor[d]; - AT_CHECK(min_index_in_dim >= 0, + TORCH_CHECK(min_index_in_dim >= 0, "found negative index ", min_index_in_dim, " for dim ", d); computed_sizes[static_cast(d)] = cpu_computed_indices_sizes_accessor[d]; } @@ -186,14 +194,14 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, ArrayRef< Tensor values = expand_values_if_needed(values_); // arg checking - AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + TORCH_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); // the following checks are redundant because they are also checked in SparseTensorImpl::set_indices_and_values_unsafe // but we need to ensure them in order to infer the shape. 
- AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) - AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) + TORCH_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); int64_t sparse_dim = indices.size(0); int64_t dense_dim = values.dim() - 1; - AT_CHECK(size.size() == sparse_dim + dense_dim, + TORCH_CHECK(size.size() == sparse_dim + dense_dim, "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); // Check to make sure all indices are within the boundaries of `size` @@ -214,11 +222,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, ArrayRef< // NB: This used to sync ndim times to access each entry; now we copy // everything to CPU first and then access it. int64_t min_index_in_dim = cpu_min_indices_accessor[d]; - AT_CHECK(min_index_in_dim >= 0, + TORCH_CHECK(min_index_in_dim >= 0, "found negative index ", min_index_in_dim, " for dim ", d); int64_t max_index_in_dim = cpu_max_indices_accessor[d]; int64_t dim_size = size[static_cast(d)]; - AT_CHECK(max_index_in_dim < dim_size, + TORCH_CHECK(max_index_in_dim < dim_size, "size is inconsistent with indices: for dim ", d, ", size is ", dim_size, " but found index ", max_index_in_dim); } } @@ -236,7 +244,7 @@ Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, A Tensor values = expand_values_if_needed(values_); // arg checking - AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + TORCH_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); int64_t sparse_dim = indices.size(0); int64_t dense_dim = values.dim() - 1; @@ -288,8 +296,8 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ int64_t dims = self.dim(); // TODO: it seems like sparse_dim == 0 could be supported even if self.dim() > 0, // but this would take some work and doesn't seem particularly useful. 
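The two sparse_coo_tensor overloads above either infer each dimension's size from the largest index seen or validate the indices against a user-supplied size, rejecting negative indices in both cases. A standalone sketch of that bookkeeping for a sparse_dim x nnz index matrix (hypothetical data, plain std::vector rather than the ATen accessors used above):

    #include <algorithm>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Infers per-dimension sizes from COO indices; mirrors the min-index check above.
    std::vector<int64_t> infer_coo_sizes(const std::vector<std::vector<int64_t>>& indices) {
      std::vector<int64_t> sizes(indices.size(), 0);
      for (size_t d = 0; d < indices.size(); ++d) {
        for (int64_t idx : indices[d]) {
          if (idx < 0) throw std::invalid_argument("found negative index");
          sizes[d] = std::max(sizes[d], idx + 1);  // a dim holding index k needs size >= k + 1
        }
      }
      return sizes;
    }

    // Validates indices against a given size; mirrors the max-index check above.
    void check_coo_sizes(const std::vector<std::vector<int64_t>>& indices,
                         const std::vector<int64_t>& size) {
      for (size_t d = 0; d < indices.size(); ++d) {
        for (int64_t idx : indices[d]) {
          if (idx < 0 || idx >= size[d])
            throw std::invalid_argument("size is inconsistent with indices");
        }
      }
    }

    int main() {
      // nnz = 3 entries of a 2-D tensor at positions (0,2), (1,0), (1,2)
      std::vector<std::vector<int64_t>> indices = {{0, 1, 1}, {2, 0, 2}};
      auto sizes = infer_coo_sizes(indices);  // {2, 3}
      check_coo_sizes(indices, {2, 3});       // passes; {2, 2} would throw
      return sizes.size() == 2 ? 0 : 1;
    }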
- AT_CHECK(sparse_dim > 0 || self.dim() == 0, "sparse_dim must be >0 if dimensionality > 0"); - AT_CHECK(sparse_dim <= dims, + TORCH_CHECK(sparse_dim > 0 || self.dim() == 0, "sparse_dim must be >0 if dimensionality > 0"); + TORCH_CHECK(sparse_dim <= dims, "sparse_dim must be less than or equal to self.dim()"); at::TensorOptions sparse_options = self.options().layout(kSparse); std::vector sizes = self.sizes().vec(); @@ -325,6 +333,9 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ // NB: Dropped the resizeNd variants Tensor sparse_to_dense(const SparseTensor& self) { + if(self.scalar_type() == ScalarType::Half && self.options().device().is_cpu()) { + AT_ERROR("to_dense() not supported for float16 on CPU"); + } Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); return dst.add_(self); } @@ -441,12 +452,12 @@ void inline sparse_mask_out_cpu_kernel( } SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { - AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); - AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + TORCH_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + TORCH_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", t.sizes(), " but mask has size ", mask.sizes()); AT_ASSERT(!t.is_cuda()); // we were supposed to have dispatched on this - AT_CHECK(!r.is_cuda(), "sparse_mask: expected 'out' to be CPU, but got CUDA"); - AT_CHECK(!mask.is_cuda(), "sparse_mask: expected 'mask' to be CPU, but got CUDA"); + TORCH_CHECK(!r.is_cuda(), "sparse_mask: expected 'out' to be CPU, but got CUDA"); + TORCH_CHECK(!mask.is_cuda(), "sparse_mask: expected 'mask' to be CPU, but got CUDA"); resize_as_sparse_(r, mask); if (mask._nnz() == 0) { return r.zero_(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 67b7d24edc80..6aa1966d3f4b 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -99,7 +99,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place - AT_CHECK( + TORCH_CHECK( r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); } else { @@ -110,7 +110,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { } SparseTensor& log1p_sparse_(SparseTensor& t) { - AT_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + TORCH_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); return log1p_out_sparse(t, t); } @@ -123,7 +123,7 @@ SparseTensor& log1p_sparse_(SparseTensor& t) { SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Scalar value) { AT_ASSERT(r.is_sparse()); AT_ASSERT(t_.is_sparse()); - AT_CHECK(value.toDouble() != 0, "pow: cannot raise to zeroth power on sparse tensor; it would make the result tensor dense"); + TORCH_CHECK(value.toDouble() != 0, "pow: cannot raise to zeroth power on sparse tensor; it would make the result tensor dense"); // This coalesce is why we can't easily provide an inplace variant SparseTensor t = t_.coalesce(); @@ -191,10 +191,10 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S AT_ASSERT(r.is_sparse()); 
AT_ASSERT(t.is_sparse()); AT_ASSERT(!t.is_cuda()); // the dispatch argument - AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!src.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!src.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); + TORCH_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { return copy_sparse_to_sparse_(r, t); @@ -203,7 +203,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S return mul_out_sparse_scalar(r, src, value); } - AT_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); + TORCH_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); // saving those because they can be overwritten when doing in-place operations int64_t t_nnz = t._nnz(), s_nnz = src._nnz(), max_nnz = t_nnz + s_nnz; @@ -336,10 +336,10 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, SparseTensorRef AT_ASSERT(sparse_.is_sparse()); AT_ASSERT(!dense.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!sparse_.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!sparse_.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(dense.sizes().equals(sparse_.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + TORCH_CHECK(dense.sizes().equals(sparse_.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", dense.sizes(), " while other has size ", sparse_.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); r.resize_as_(dense); @@ -384,12 +384,12 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor return mul_out_sparse_zerodim(r, src_, t_); } - AT_CHECK(t_.sizes().equals(src_.sizes()), "mul operands have incompatible sizes"); + TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul operands have incompatible sizes"); AT_ASSERT(!t_.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!src_.is_cuda(), "mul: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!src_.is_cuda(), "mul: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); + TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { r.resize_as_(src_); @@ 
-536,24 +536,24 @@ Tensor& s_addmm_out_sparse_dense_cpu( ) { // TODO: This error message seems awfully opaque AT_ASSERT(!t.is_cuda()); - AT_CHECK(!r.is_cuda(), "addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); + TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); + TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); + TORCH_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); // ixj * jxk = ixk int64_t dim_i = sparse_.size(0); int64_t dim_j = sparse_.size(1); int64_t dim_k = dense.size(1); - AT_CHECK(dense.size(0) == dim_j, + TORCH_CHECK(dense.size(0) == dim_j, "addmm: Argument #3 (dense): Expected dim 0 size ", dim_j, ", got ", dense.size(0)); - AT_CHECK(t.size(0) == dim_i, + TORCH_CHECK(t.size(0) == dim_i, "addmm: Argument #1 (t): Expected dim 0 size ", dim_i, ", got ", t.size(0)); - AT_CHECK(t.size(1) == dim_k, + TORCH_CHECK(t.size(1) == dim_k, "addmm: Argument #1 (t): Expected dim 1 size ", dim_k, ", got ", t.size(1)); r.resize_({dim_i, dim_k}); @@ -629,21 +629,21 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, Scalar alpha = 1; AT_ASSERT(!sparse_.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, + TORCH_CHECK(sparse_.sparse_dim() == 2, "hspmm: Argument #2: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, + TORCH_CHECK(sparse_.dense_dim() == 0, "hspmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, + TORCH_CHECK(dense.dim() == 2, "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); int64_t m = sparse_.size(0); int64_t k = sparse_.size(1); int64_t n = dense.size(1); - AT_CHECK(dense.size(0) == k, + TORCH_CHECK(dense.size(0) == k, "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); get_sparse_impl(r)->raw_resize_(1, 1, {m, n}); @@ -714,15 +714,15 @@ SparseTensor& _sspaddmm_out_cpu( Scalar alpha ) { AT_ASSERT(!t.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "sspaddmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(!dense.is_cuda(), 
"sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "sspaddmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!dense.is_cuda(), "sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, + TORCH_CHECK(sparse_.sparse_dim() == 2, "sspaddmm: Argument #2: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, + TORCH_CHECK(sparse_.dense_dim() == 0, "sspaddmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, + TORCH_CHECK(dense.dim() == 2, "sspaddmm: Argument #2: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); @@ -736,11 +736,11 @@ SparseTensor& _sspaddmm_out_cpu( // See test_saddmm get_sparse_impl(r)->raw_resize_(2, 0, {dim_i, dim_k}); - AT_CHECK(dense.size(0) == dim_j, + TORCH_CHECK(dense.size(0) == dim_j, "sspaddmm: Argument #3: Expected dim 0 size ", dim_j, ", got ", dense.size(0)); - AT_CHECK(t.size(0) == dim_i, + TORCH_CHECK(t.size(0) == dim_i, "sspaddmm: Argument #1: Expected dim 0 size ", dim_i, ", got ", t.size(0)); - AT_CHECK(t.size(1) == dim_k, + TORCH_CHECK(t.size(1) == dim_k, "sspaddmm: Argument #1: Expected dim 1 size ", dim_k, ", got ", t.size(1)); int64_t nnz = sparse._nnz(); @@ -858,7 +858,7 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum, ScalarTyp } Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) { - AT_CHECK(input._nnz() > 0, "_sparse_sum: sparse tensor input._nnz() == 0, please call torch.sparse.sum(input) instead.") + TORCH_CHECK(input._nnz() > 0, "_sparse_sum: sparse tensor input._nnz() == 0, please call torch.sparse.sum(input) instead.") const int64_t input_dim = input.dim(); auto dims_to_sum_b = dim_list_to_bitset(dims_to_sum, input_dim); @@ -975,8 +975,8 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) { // - grad.values might have zeros // -------------------------------------------------------------------- Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_, IntArrayRef dims_to_sum) { - AT_CHECK(!grad_.is_cuda(), "_sparse_sum_backward_cpu: expected 'grad_' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!input_.is_cuda(), "_sparse_sum_backward_cpu: expected 'input_' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!grad_.is_cuda(), "_sparse_sum_backward_cpu: expected 'grad_' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!input_.is_cuda(), "_sparse_sum_backward_cpu: expected 'input_' to be CPU tensor, but got CUDA tensor"); auto input = input_.coalesce(); const int64_t input_dim = input.dim(); @@ -1009,7 +1009,7 @@ Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_, const bool sum_sparse_dim = (sparse_dims_to_sum_size > 0); if (sum_all_sparse_dim) { - AT_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be dense since all sparse dims are summed"); + TORCH_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be dense since all sparse dims are summed"); auto grad_input_values = grad_; auto expand_size = input_values.sizes().vec(); if (sum_dense_dim) { @@ -1023,7 +1023,7 @@ Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_, return 
at::_sparse_coo_tensor_with_dims_and_tensors(input_sparse_dim, input_dense_dim, input_sizes, input_indices.clone(), grad_input_values, input.options().dtype(grad_.dtype())); // convert to grad dtype } else { - AT_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be sparse, but got dense"); + TORCH_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be sparse, but got dense"); auto grad = grad_.coalesce(); LongTensor grad_indices = grad._indices(); Tensor grad_values = grad._values(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu index f364ee7d204f..bea1a8aa94eb 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -66,7 +66,7 @@ inline cusparseHandle_t setCUDASparseStream() { } void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr) { - AT_CHECK((m <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (nnz <= INT_MAX), "cusparseXcoo2csr only supports m, nnz with the bound [val] <= ", INT_MAX); @@ -117,7 +117,7 @@ void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t cusparseOperation_t opa = convertTransToCusparseOperation(transa); cusparseOperation_t opb = convertTransToCusparseOperation(transb); - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), "cusparseScsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); int i_m = (int)m; int i_n = (int)n; @@ -144,7 +144,7 @@ void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t cusparseOperation_t opa = convertTransToCusparseOperation(transa); cusparseOperation_t opb = convertTransToCusparseOperation(transb); - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), "cusparseDcsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); int i_m = (int)m; int i_n = (int)n; @@ -169,7 +169,7 @@ void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t /* format conversion */ void CreateIdentityPermutation(int64_t nnz, int *P) { - AT_CHECK((nnz <= INT_MAX), + TORCH_CHECK((nnz <= INT_MAX), "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", INT_MAX); int i_nnz = (int)nnz; @@ -180,7 +180,7 @@ void CreateIdentityPermutation(int64_t nnz, int *P) { void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <=", INT_MAX); int i_m = (int)m; @@ -193,7 +193,7 @@ void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRow void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "Xcsrsort only supports m, n, nnz with the bound [val] <= ", 
INT_MAX); int i_m = (int)m; @@ -209,7 +209,7 @@ void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrC void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "Xcoosort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", INT_MAX); int i_m = (int)m; @@ -222,7 +222,7 @@ void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRow void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "XcoosortByRow only supports m, n, nnz with the bound [val] <= ", INT_MAX); int i_m = (int)m; diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 5bb961591c6f..15cd4b0e15cb 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -9,13 +9,13 @@ namespace at { namespace native { using namespace at::sparse; SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { - AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); - AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + TORCH_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + TORCH_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", t.sizes(), " but mask has size ", mask.sizes()); AT_ASSERT(t.is_cuda()); // dispatch argument - AT_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); - AT_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r, t, mask}), + TORCH_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); + TORCH_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(cuda::check_device({r, t, mask}), "sparse_mask: arguments are located on different devices; self is on device ", t.get_device(), ", mask is on device ", mask.get_device(), ", out is on device ", r.get_device()); resize_as_sparse_(r, mask); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index bf7bb8b73cd8..452fb397c13b 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -52,14 +52,14 @@ namespace { Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseTensor& sparse_, const Tensor& dense, Scalar beta, Scalar alpha) { AT_ASSERT(t.is_cuda()); // dispatch argument - AT_CHECK(r_.is_cuda(), "addmm: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); - AT_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "addmm: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); + TORCH_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({sparse_, r_, t, dense})); + 
TORCH_CHECK(cuda::check_device({sparse_, r_, t, dense})); - AT_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims"); + TORCH_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor"); + TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims"); // no need to check dense_dim because dense_dim + sparse_dim = dim // mxk * kxn = mxn @@ -67,11 +67,11 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT int64_t k = sparse_.size(1); int64_t n = dense.size(1); - AT_CHECK(t.size(0) == m, + TORCH_CHECK(t.size(0) == m, "addmm: Argument #1 (t): Expected dim 0 size ", m, ", got ", t.size(0)); - AT_CHECK(t.size(1) == n, + TORCH_CHECK(t.size(1) == n, "addmm: Argument #1 (t): Expected dim 1 size ", n, ", got ", t.size(1)); - AT_CHECK(dense.size(0) == k, + TORCH_CHECK(dense.size(0) == k, "addmm: Argument #3 (dense): Expected dim 0 size ", k, ", got ", dense.size(0)); r_.resize_({m, n}); @@ -181,23 +181,23 @@ Tensor& s_addmm_sparse_dense_cuda_( SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse_, const Tensor& dense/* , Scalar alpha */) { AT_ASSERT(sparse_.is_cuda()); // dispatch argument - AT_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r_, sparse_, dense})); + TORCH_CHECK(cuda::check_device({r_, sparse_, dense})); - AT_CHECK(sparse_.sparse_dim() == 2, + TORCH_CHECK(sparse_.sparse_dim() == 2, "hspmm: Argument #2: 2D tensor expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, + TORCH_CHECK(sparse_.dense_dim() == 0, "hspmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, + TORCH_CHECK(dense.dim() == 2, "hspmm: Argument #3: 2D tensor expected, got ", dense.dim(), "D tensor"); int64_t m = sparse_.size(0); int64_t k = sparse_.size(1); int64_t n = dense.size(1); - AT_CHECK(dense.size(0) == k, + TORCH_CHECK(dense.size(0) == k, "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); get_sparse_impl(r_)->resize_and_clear_(1, 1, {m, n}); @@ -252,12 +252,12 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR const SparseTensor& sparse = sparse_.tref; AT_ASSERT(dense.is_cuda()); // dispatch argument - AT_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); - AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({sparse, r_, dense})); + TORCH_CHECK(cuda::check_device({sparse, r_, dense})); - AT_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + TORCH_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", dense.sizes(), " while other has size ", 
sparse.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); const int64_t nnz = sparse._nnz(); @@ -272,7 +272,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR r_.resize_as_(dense); r_.copy_(dense); } else { - AT_CHECK(r_.is_contiguous(), "add: CUDA dense-sparse addition with a non-contiguous output tensor does not work; shout if you need it (see https://github.com/pytorch/pytorch/issues/1521 )"); + TORCH_CHECK(r_.is_contiguous(), "add: CUDA dense-sparse addition with a non-contiguous output tensor does not work; shout if you need it (see https://github.com/pytorch/pytorch/issues/1521 )"); r = r_.contiguous(); } @@ -293,7 +293,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); if (sparse.dense_dim() == 0) { - AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); AT_DISPATCH_ALL_TYPES_AND( at::ScalarType::Half, values.scalar_type(), "add_out_dense_sparse_cuda", [&] { @@ -304,7 +304,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR static_cast(nnz)); }); } else { - AT_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); // sparseElementwiseKernel needs values to be contiguous too values = values.contiguous(); @@ -354,11 +354,11 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const SparseTensor& src, Scalar value) { AT_ASSERT(t.is_cuda()); // dispatch argument - AT_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); - AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r_, t, src})); - AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); + TORCH_CHECK(cuda::check_device({r_, t, src})); + TORCH_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { return copy_sparse_to_sparse_(r_, t); @@ -367,7 +367,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const return mul_out_sparse_scalar(r_, src, value); } - AT_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); + TORCH_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); // We deliberately choose to simply concat the indices and values tensors // rather than merging them. 
This removes the need to synchronously fetch nnz @@ -413,10 +413,10 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons } AT_ASSERT(t_.is_cuda()); // dispatch argument - AT_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); - AT_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r_, t_, src_})); - AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); + TORCH_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(cuda::check_device({r_, t_, src_})); + TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); SparseTensor t = t_.coalesce(); SparseTensor src = src_.coalesce(); @@ -445,7 +445,7 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - AT_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); LongTensor resultNnz = at::empty({1}, CUDA(kLong)); AT_DISPATCH_ALL_TYPES_AND( @@ -519,8 +519,8 @@ __global__ void _sparse_sum_backward_cuda_kernel( } Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_, IntArrayRef dims_to_sum) { - AT_CHECK(grad_.is_cuda(), "_sparse_sum_backward_cuda: expected 'grad_' to be CUDA tensor, but got CPU tensor"); - AT_CHECK(input_.is_cuda(), "_sparse_sum_backward_cuda: expected 'input_' to be CUDA tensor, but got CPU tensor"); + TORCH_CHECK(grad_.is_cuda(), "_sparse_sum_backward_cuda: expected 'grad_' to be CUDA tensor, but got CPU tensor"); + TORCH_CHECK(input_.is_cuda(), "_sparse_sum_backward_cuda: expected 'input_' to be CUDA tensor, but got CPU tensor"); auto input = input_.coalesce(); const int64_t input_dim = input.dim(); @@ -553,7 +553,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_ const bool sum_sparse_dim = (sparse_dims_to_sum_size > 0); if (sum_all_sparse_dim) { - AT_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad Tensor to be dense since all sparse dims are summed"); + TORCH_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad Tensor to be dense since all sparse dims are summed"); auto grad_input_values = grad_; auto expand_size = input_values.sizes().vec(); if (sum_dense_dim) { @@ -566,7 +566,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_ return at::_sparse_coo_tensor_with_dims_and_tensors(input_sparse_dim, input_dense_dim, input_sizes, input_indices.clone(), grad_input_values, input.options().dtype(grad_.dtype())); // convert to grad dtype } else { - AT_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad_ Tensor to be sparse, but got dense"); + TORCH_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad_ Tensor to be sparse, but got dense"); auto grad = grad_.coalesce(); LongTensor grad_indices = grad._indices(); Tensor grad_values = grad._values(); @@ -617,7 +617,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_ int64_t total_threads = input_nnz; const dim3 
block = dim3(std::min(static_cast(cuda::getApplyBlock().x), total_threads)); dim3 grid; - AT_CHECK(cuda::getApplyGrid(total_threads, grid, curDevice), "_sparse_sum_backward_cuda: input too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(total_threads, grid, curDevice), "_sparse_sum_backward_cuda: input too large or too many dimensions"); auto grad_indices_ti = getTensorInfo(grad_indices_1D); auto input_indices_ti = getTensorInfo(input_indices_1D); diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py index 30a40ae83c34..e93e41dbd46a 100644 --- a/aten/src/ATen/native_parse.py +++ b/aten/src/ATen/native_parse.py @@ -127,6 +127,8 @@ def type_argument_translations(arg): # we change this at either a JIT schema or C++ level. elif default == 'Mean': default = 'Reduction::Mean' + elif default == 'contiguous_format': + default = 'MemoryFormat::Contiguous' else: try: default = int(default) diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml index 37a50089cf1f..5f15b1ec59bc 100644 --- a/aten/src/ATen/nn.yaml +++ b/aten/src/ATen/nn.yaml @@ -118,12 +118,6 @@ # Pooling -- name: _thnn_adaptive_avg_pool3d(Tensor self, IntArrayRef[3] output_size) - cname: VolumetricAdaptiveAveragePooling - scalar_check: - output: 'false' - grad_input: 'false' - - name: _thnn_avg_pool2d(Tensor self, IntArrayRef[2] kernel_size, IntArrayRef[2] stride={}, IntArrayRef[2] padding=0, bool ceil_mode=false, bool count_include_pad=true) cname: SpatialAveragePooling default_init: @@ -140,14 +134,6 @@ output: 'false' grad_input: 'false' -- name: _thnn_max_pool2d_with_indices(Tensor self, IntArrayRef[2] kernel_size, IntArrayRef[2] stride={}, IntArrayRef[2] padding=0, IntArrayRef[2] dilation=1, bool ceil_mode=false) - cname: SpatialDilatedMaxPooling - default_init: - stride: kernel_size - scalar_check: - output: 'false' - grad_input: 'false' - - name: _thnn_max_pool3d_with_indices(Tensor self, IntArrayRef[3] kernel_size, IntArrayRef[3] stride={}, IntArrayRef[3] padding=0, IntArrayRef[3] dilation=1, bool ceil_mode=false) cname: VolumetricDilatedMaxPooling default_init: @@ -168,53 +154,6 @@ output: 'false' grad_input: 'false' -# Upsampling - -# Note: The upsampling backwards functions also include an IntArrayRef input_size -# parameter, which is added by nn_parse.py - -- name: _thnn_upsample_linear1d(Tensor self, IntArrayRef[1] output_size, bool align_corners) - cname: TemporalUpSamplingLinear - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_bilinear2d(Tensor self, IntArrayRef[2] output_size, bool align_corners) - cname: SpatialUpSamplingBilinear - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_bicubic2d(Tensor self, IntArrayRef[2] output_size, bool align_corners) - cname: SpatialUpSamplingBicubic - scalar_check: - grad_input: 'false' - -- name: _thnn_upsample_trilinear3d(Tensor self, IntArrayRef[3] output_size, bool align_corners) - cname: VolumetricUpSamplingTrilinear - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_nearest1d(Tensor self, IntArrayRef[1] output_size) - cname: TemporalUpSamplingNearest - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_nearest2d(Tensor self, IntArrayRef[2] output_size) - cname: SpatialUpSamplingNearest - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_nearest3d(Tensor self, IntArrayRef[3] output_size) - cname: VolumetricUpSamplingNearest - scalar_check: - self: 'false' - grad_input: 'false' - - # Private 
functions. These also exist in TH, but we want the backwards functions # to implement derivatives. diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index c5e7cdfad0f2..0564a9904411 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -19,6 +19,8 @@ ], 'quantized': [ 'QInt8', + 'QUInt8', + 'QInt32', ] } diff --git a/aten/src/ATen/quantized/QTensorImpl.h b/aten/src/ATen/quantized/QTensorImpl.h index 3aeeb4025702..f41d1ec15ca6 100644 --- a/aten/src/ATen/quantized/QTensorImpl.h +++ b/aten/src/ATen/quantized/QTensorImpl.h @@ -25,7 +25,9 @@ struct CAFFE2_API QTensorImpl : public c10::TensorImpl { return quantizer_; } - c10::intrusive_ptr shallow_copy_and_detach() const override { + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { auto impl = c10::make_intrusive( Storage(storage()), type_id(), quantizer_); impl->set_sizes_and_strides(sizes(), strides()); @@ -34,6 +36,8 @@ struct CAFFE2_API QTensorImpl : public c10::TensorImpl { impl->reserved_ = reserved_; impl->refresh_numel(); impl->refresh_contiguous(); + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); return impl; } diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 41c29b7bc414..0e913357e887 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -11,11 +11,263 @@ namespace at { +void checkFloatCPUTensor(std::string fn_name, Tensor t) { + TORCH_CHECK( + t.scalar_type() == kFloat, + fn_name, + "expects a Float Tensor."); + TORCH_CHECK( + t.device() == kCPU, + fn_name, + "expects a CPU Tensor."); +} + +template +void checkQuantizedCPUTensor(std::string fn_name, Tensor t) { + TORCH_CHECK(t.is_quantized(), + fn_name, + "expects a quantized Tensor."); + TORCH_CHECK(t.scalar_type() == caffe2::TypeMeta::Make(), + fn_name, + "expects a ", + caffe2::TypeMeta::Make(), + "Tensor"); + TORCH_CHECK(t.device() == kCPU, + fn_name, + "expects a CPU quantized Tensor"); +} + +template +void checkZeroPoint(std::string fn_name, int32_t zero_point) { + TORCH_CHECK(zero_point <= std::numeric_limits::max(), + fn_name, + "zero_point is out of range."); + TORCH_CHECK(zero_point >= std::numeric_limits::min(), + fn_name, + "zero_point is out of range."); +} + +template +void checkZeroPoints(std::string fn_name, std::vector zero_points) { + for (int i = 0; i < zero_points.size(); ++i) { + TORCH_CHECK(zero_points[i] <= std::numeric_limits::max(), + fn_name, + "zero_point", + i, + "is out of range."); + TORCH_CHECK(zero_points[i] >= std::numeric_limits::min(), + fn_name, + "zero_point", + i, + "is out of range."); + } +} + +#ifdef USE_FBGEMM +// Note: quantize_val is only explicitly used in test outside of this file +template +T quantize_val(float scale, int32_t zero_point, float value) { + // Internally, fbgemm::Quantize uses std::nearbyint. + // std::nearbyint results in nearest integer value according to the current + // rounding mode and the default rounding mode is rounds to even in half-way + // cases in most popular processor architectures like x86 and ARM. 
This is + // typically faster than an alternatives like std::round that rounds half-way + // cases away from zero, and can be consistent with SIMD implementations for + // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with + // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode. + int32_t qvalue; + qvalue = fbgemm::Quantize( + value, + zero_point, + scale, + /*result_precision=*/CHAR_BIT * sizeof(typename T::underlying)); + return static_cast(qvalue); +} + +// TODO: dequantize_val? + +template +Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point) { + auto fn_name = "quantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const float* rd = rtensor.data(); + auto qd = reinterpret_cast(qtensor.data()); + fbgemm::TensorQuantizationParams qparams; + qparams.scale = scale; + qparams.zero_point = zero_point; + qparams.precision = CHAR_BIT * sizeof(typename T::underlying); + fbgemm::Quantize(/*src=*/rd, + /*dst=*/qd, + /*len=*/rtensor.numel(), + /*qparams=*/qparams); + return qtensor; +} + +template +Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, float scale, int32_t zero_point) { + auto fn_name = "dequantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const auto* qd = reinterpret_cast(qtensor.data()); + fbgemm::TensorQuantizationParams qparams; + qparams.scale = scale; + qparams.zero_point = zero_point; + qparams.precision = CHAR_BIT * sizeof(typename T::underlying); + float* rd = rtensor.data(); + fbgemm::Dequantize(/*src=*/qd, + /*dst=*/rd, + /*len=*/qtensor.numel(), + /*qparams=*/qparams); + return rtensor; +} +#else + +template +T quantize_val(float scale, int32_t zero_point, float value) { + // std::nearbyint results in nearest integer value according to the current + // rounding mode and the default rounding mode is rounds to even in half-way + // cases in most popular processor architectures like x86 and ARM. This is + // typically faster than an alternatives like std::round that rounds half-way + // cases away from zero, and can be consistent with SIMD implementations for + // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with + // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode. 
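// Illustrative half-way case for the comment above (values made up, assuming the
// default FE_TONEAREST rounding mode): with scale = 1.0 and zero_point = 2, an
// input of 0.5 maps to value / scale + zero_point = 2.5, an exact tie, and
//   std::nearbyint(2.5f) == 2.0f   // ties to even -- what this function computes
//   std::round(2.5f)     == 3.0f   // ties away from zero
//   std::nearbyint(3.5f) == 4.0f   // both agree when rounding up lands on even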
+ int32_t qvalue; + constexpr int32_t qmin = std::numeric_limits::min(); + constexpr int32_t qmax = std::numeric_limits::max(); + checkZeroPoint("quantize_val", zero_point); + qvalue = static_cast(std::nearbyint(value / scale + zero_point)); + qvalue = std::max(qvalue, qmin); + qvalue = std::min(qvalue, qmax); + return static_cast(qvalue); +} + +template +Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point) { + auto fn_name = "quantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const float* rdata = rtensor.data(); + auto qdata = qtensor.data(); + for (int i = 0; i < rtensor.numel(); ++i) { + qdata[i] = quantize_val(scale, zero_point, rdata[i]); + } + return qtensor; +} + +template +Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, float scale, int32_t zero_point) { + auto fn_name = "dequantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const auto* qd = qtensor.data(); + float* rd = rtensor.data(); + for (auto i = 0; i < qtensor.numel(); ++i) { + // We need to convert the qint8 value to float to ensure the subtraction + // subexpression returns a float + rd[i] = (static_cast(qd[i].val_) - zero_point) * scale; + } + return rtensor; +} +#endif +template CAFFE2_API qint8 quantize_val(float scale, int32_t zero_point, float value); +template CAFFE2_API quint8 quantize_val(float scale, int32_t zero_point, float value); +template CAFFE2_API qint32 quantize_val(float scale, int32_t zero_point, float value); +template CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor dequantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor dequantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor dequantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); + + +// TODO: add fbgemm for per channel +template +Tensor quantize_tensor_per_channel_affine(Tensor rtensor, + Tensor qtensor, + std::vector scales, + std::vector zero_points, + std::vector axis) { + auto fn_name = "quantize_tensor_per_channel_affine"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoints(fn_name, zero_points); + int64_t channel_axis = axis[0]; + TORCH_CHECK(channel_axis < rtensor.dim(), "Channel axis out of range in per channel affine quantization."); + int64_t batches = size_to_dim_(channel_axis, rtensor.sizes()); + int64_t elements_per_channel = size_from_dim_(channel_axis + 1, rtensor.sizes()); + int64_t channel = rtensor.size(channel_axis); + TORCH_CHECK(channel == scales.size(), + "length of scales must equal to channel"); + TORCH_CHECK(channel == zero_points.size(), + "length of zero_points must equal to channel"); + const float* rdata = rtensor.data(); + auto qdata = qtensor.data(); + for (auto b = 0; b < batches; ++b) { + for (auto c = 0; c < channel; ++c) { + for (auto e = 0; e < elements_per_channel; ++e) { + auto i = b * channel * elements_per_channel + c * elements_per_channel + e; + qdata[i] = quantize_val(scales[c], zero_points[c], 
rdata[i]); + } + } + } + return qtensor; +} + +template +Tensor dequantize_tensor_per_channel_affine(Tensor qtensor, + Tensor rtensor, + std::vector scales, + std::vector zero_points, + std::vector axis) { + auto fn_name = "dequantize_tensor_per_channel_affine"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoints(fn_name, zero_points); + int64_t channel_axis = axis[0]; + TORCH_CHECK(channel_axis < qtensor.dim(), + "Channel axis out of range in per channel affine dequantization."); + int64_t batches = size_to_dim_(channel_axis, rtensor.sizes()); + int64_t elements_per_channel = size_from_dim_(channel_axis + 1, rtensor.sizes()); + int64_t channel = rtensor.size(channel_axis); + TORCH_CHECK(channel == scales.size(), + "length of scales must equal to channel"); + TORCH_CHECK(channel == zero_points.size(), + "length of zero_points must equal to channel"); + const auto* qd = qtensor.data(); + float* rd = rtensor.data(); + for (auto b = 0; b < batches; ++b) { + for (auto c = 0; c < channel; ++c) { + for (auto e = 0; e < elements_per_channel; ++e) { + auto i = b * channel * elements_per_channel + c * elements_per_channel + e; + // We need to convert the qint8 value to float to ensure the subtraction + // subexpression returns a float + rd[i] = (static_cast(qd[i].val_) - zero_points[c]) * scales[c]; + } + } + } + return rtensor; +} + QuantizerPtr make_per_tensor_affine_quantizer( double scale, - int64_t zero_point) { - return c10::make_intrusive( - static_cast(scale), static_cast(zero_point)); + int64_t zero_point, + ScalarType scalar_type) { + return c10::make_intrusive(scalar_type, + static_cast(scale), static_cast(zero_point)); +} + +QuantizerPtr make_per_channel_affine_quantizer( + std::vector scales, + std::vector zero_points, + std::vector axis, + ScalarType scalar_type) { + return c10::make_intrusive(scalar_type, + scales, zero_points, axis); } QTensorImpl* get_qtensorimpl(const Tensor& self) { @@ -39,7 +291,7 @@ inline Tensor new_qtensor_cpu( auto* allocator = at::getCPUAllocator(); int64_t nelements = at::prod_intlist(sizes); auto dtype = options.dtype(); - AT_CHECK(isQIntType(typeMetaToScalarType(dtype)), + TORCH_CHECK(isQIntType(typeMetaToScalarType(dtype)), "ScalarType is not supported in new_qtensor_cpu."); auto storage = c10::make_intrusive( dtype, @@ -53,91 +305,84 @@ inline Tensor new_qtensor_cpu( return tensor; } -qint8 quantize_uint8(float scale, uint8_t zero_point, float value) { - // Internally, fbgemm::Quantize uses std::nearbyint. - // std::nearbyint results in nearest integer value according to the current - // rounding mode and the default rounding mode is rounds to even in half-way - // cases in most popular processor architectures like x86 and ARM. This is - // typically faster than an alternatives like std::round that rounds half-way - // cases away from zero, and can be consistent with SIMD implementations for - // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with - // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode. 
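// Worked index example for the per-channel loops above (shapes are illustrative,
// not from this PR): for a contiguous rtensor of shape {2, 3, 4} quantized along
// axis 1, the helpers above yield
//   batches              = size_to_dim_(1, sizes)      = 2
//   channel              = rtensor.size(1)             = 3
//   elements_per_channel = size_from_dim_(2, sizes)    = 4
// so element (b = 1, c = 2, e = 3) sits at flat offset
//   i = b * channel * elements_per_channel + c * elements_per_channel + e
//     = 1 * 3 * 4 + 2 * 4 + 3 = 23
// and is (de)quantized with scales[2] and zero_points[2].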
- int32_t qvalue; -#ifdef USE_FBGEMM - qvalue = fbgemm::Quantize(value, zero_point, scale, - /*result_precision=*/8); -#else - constexpr int32_t qmin = std::numeric_limits::min(); - constexpr int32_t qmax = std::numeric_limits::max(); - qvalue = static_cast(std::nearbyint(value / scale + zero_point)); - qvalue = std::max(qvalue, qmin); - qvalue = std::min(qvalue, qmax); -#endif - return static_cast(qvalue); -} - -Tensor PerTensorAffineQuantizer::quantize(Tensor tensor) { - IntArrayRef sizes = tensor.sizes(); +Tensor PerTensorAffineQuantizer::quantize(Tensor rtensor) { + TORCH_CHECK( + rtensor.scalar_type() == kFloat, + "quantize only works on Float Tensor."); + TORCH_CHECK( + rtensor.device() == kCPU, + "quantize only works for CPU backend right now."); // Here we need a std::intrusive_ptr.. but actually "this" is the // quantizer that can be reused, so I'm using intrusive_from_this here - AT_CHECK( - tensor.options().device() == kCPU, - "quantize only works for CPU backend right now."); - Tensor qv = new_qtensor_cpu( - sizes, - tensor.options().dtype(at::kQInt8), + Tensor qtensor = new_qtensor_cpu( + rtensor.sizes(), + rtensor.options().dtype(scalar_type_), intrusive_from_this()); - tensor = tensor.contiguous(); - const float* svd = tensor.data(); + rtensor = rtensor.contiguous(); + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), "quantize_tensor", [&]() { + qtensor = quantize_tensor(rtensor, qtensor, scale_, zero_point_); + }); + return qtensor; +} -#ifdef USE_FBGEMM - auto qvd = reinterpret_cast(qv.data()); - fbgemm::TensorQuantizationParams qparams; - qparams.scale = scale_; - qparams.zero_point = zero_point_; - qparams.precision = 8; - fbgemm::Quantize(/*src=*/svd, - /*dst=*/qvd, - /*len=*/tensor.numel(), - /*qparams=*/qparams); -#else - auto qvd = qv.data(); - for (int i = 0; i < tensor.numel(); ++i) { - qvd[i] = quantize_uint8(scale_, zero_point_, svd[i]); - } -#endif - return qv; +Tensor PerTensorAffineQuantizer::dequantize(Tensor qtensor) { + TORCH_CHECK(qtensor.is_quantized(), + "dequantize is only supported in quantized Tensor."); + TORCH_CHECK( + qtensor.device() == kCPU, + "dequantize only works for CPU backend right now."); + Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat)); + qtensor = qtensor.contiguous(); + + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), "dequantize_tensor", [&]() { + rtensor = dequantize_tensor(qtensor, rtensor, scale_, zero_point_); + }); + + return rtensor; } -Tensor PerTensorAffineQuantizer::dequantize(Tensor tensor) { - std::vector sizes = tensor.sizes().vec(); - at::TensorOptions options = tensor.options().dtype(at::kFloat); +Tensor PerChannelAffineQuantizer::quantize(Tensor rtensor) { + TORCH_CHECK( + rtensor.scalar_type() == kFloat, + "quantize only works on Float Tensor."); + TORCH_CHECK( + rtensor.device() == kCPU, + "quantize only works for CPU backend right now."); + // Here we need a std::intrusive_ptr.. 
but actually "this" is the + // quantizer that can be reused, so I'm using intrusive_from_this here + Tensor qtensor = new_qtensor_cpu( + rtensor.sizes(), + rtensor.options().dtype(scalar_type_), + intrusive_from_this()); - Tensor rv = at::empty(sizes, options); - float* rvd = rv.data(); - tensor = tensor.contiguous(); + rtensor = rtensor.contiguous(); + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), + "quantize_tensor_per_channel_affine", + [&]() { + qtensor = quantize_tensor_per_channel_affine( + rtensor, qtensor, scales_, zero_points_, axis_); + }); + return qtensor; +} -#ifdef USE_FBGEMM - const auto* qvd = reinterpret_cast(tensor.data()); - fbgemm::TensorQuantizationParams qparams; - qparams.scale = scale_; - qparams.zero_point = zero_point_; - qparams.precision = 8; - fbgemm::Dequantize(/*src=*/qvd, - /*dst=*/rvd, - /*len=*/tensor.numel(), - /*qparams=*/qparams); -#else - const auto* qvd = tensor.data(); - for (auto i = 0; i < tensor.numel(); ++i) { - // We need to convert the qint8 value to float to ensure the subtraction - // subexpression returns a float - rvd[i] = (static_cast(qvd[i].val_) - zero_point_) * scale_; - } -#endif +Tensor PerChannelAffineQuantizer::dequantize(Tensor qtensor) { + TORCH_CHECK(qtensor.is_quantized(), + "dequantize is only supported in quantized Tensor."); + TORCH_CHECK( + qtensor.device() == kCPU, + "dequantize only works for CPU backend right now."); + Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat)); + qtensor = qtensor.contiguous(); + + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), + "dequantize_tensor_per_channel_affine", + [&]() { + rtensor = dequantize_tensor_per_channel_affine( + qtensor, rtensor, scales_, zero_points_, axis_); + }); - return rv; + return rtensor; } Quantizer::~Quantizer() {} diff --git a/aten/src/ATen/quantized/Quantizer.h b/aten/src/ATen/quantized/Quantizer.h index e735f8f33c1d..d103b0801768 100644 --- a/aten/src/ATen/quantized/Quantizer.h +++ b/aten/src/ATen/quantized/Quantizer.h @@ -43,7 +43,8 @@ using QuantizerPtr = c10::intrusive_ptr; */ struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { const QScheme qscheme_; - explicit Quantizer(QScheme qscheme) : qscheme_(qscheme) {} + const ScalarType scalar_type_; + explicit Quantizer(QScheme qscheme, ScalarType scalar_type) : qscheme_(qscheme), scalar_type_(scalar_type) {} virtual ~Quantizer(); // Copied from torch/csrc/jit/scope.h @@ -55,10 +56,14 @@ struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { return c10::intrusive_ptr::reclaim(this); } - virtual QScheme qscheme() { + QScheme qscheme() { return qscheme_; } + ScalarType scalar_type() { + return scalar_type_; + } + /** * quantize a float Tensor into a quantized Tensor. */ @@ -77,7 +82,7 @@ struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { * the most commonly used scheme in this category. */ struct CAFFE2_API UniformQuantizer : public Quantizer { - explicit UniformQuantizer(QScheme qscheme) : Quantizer(qscheme) {} + explicit UniformQuantizer(QScheme qscheme, ScalarType scalar_type) : Quantizer(qscheme, scalar_type) {} }; /** @@ -86,7 +91,7 @@ struct CAFFE2_API UniformQuantizer : public Quantizer { * value. K-means quantization is a representative example in this category. 
*/ struct CAFFE2_API NonUniformQuantizer : public Quantizer { - explicit NonUniformQuantizer(QScheme qscheme) : Quantizer(qscheme) {} + explicit NonUniformQuantizer(QScheme qscheme, ScalarType scalar_type) : Quantizer(qscheme, scalar_type) {} }; // There is also StochasticQuantizer which is uniform but not affine @@ -95,12 +100,12 @@ struct CAFFE2_API NonUniformQuantizer : public Quantizer { * AffineQuantizer uses affine transformation to do quantization. * * For quantize: - * Y = clamp((X * scale + zero_point, min, max) + * Y = clamp(round(X / scale + zero_point), min, max) * For dequantize: - * X = (Y - zero_point) / scale + * X = (Y - zero_point) * scale */ struct CAFFE2_API AffineQuantizer : public UniformQuantizer { - explicit AffineQuantizer(QScheme qscheme) : UniformQuantizer(qscheme) {} + explicit AffineQuantizer(QScheme qscheme, ScalarType scalar_type) : UniformQuantizer(qscheme, scalar_type) {} }; /** @@ -108,12 +113,12 @@ struct CAFFE2_API AffineQuantizer : public UniformQuantizer { * does not have zero_point * * For quantize: - * Y = clamp(X * scale, min, max) + * Y = clamp(round(X / scale), min, max) * For dequantize: - * X = Y / scale + * X = Y * scale */ struct CAFFE2_API SymmetricQuantizer : public UniformQuantizer { - explicit SymmetricQuantizer(QScheme qscheme) : UniformQuantizer(qscheme) {} + explicit SymmetricQuantizer(QScheme qscheme, ScalarType scalar_type) : UniformQuantizer(qscheme, scalar_type) {} }; /** @@ -121,8 +126,8 @@ struct CAFFE2_API SymmetricQuantizer : public UniformQuantizer { * used for quantizing all the values in the given Tensor */ struct CAFFE2_API PerTensorSymmetricQuantizer : public SymmetricQuantizer { - explicit PerTensorSymmetricQuantizer(float scale) - : SymmetricQuantizer(kPerTensorSymmetric), scale_(scale) {} + explicit PerTensorSymmetricQuantizer(ScalarType scalar_type, float scale) + : SymmetricQuantizer(kPerTensorSymmetric, scalar_type), scale_(scale) {} float scale_{1.0}; }; @@ -138,10 +143,11 @@ struct CAFFE2_API PerTensorSymmetricQuantizer : public SymmetricQuantizer { */ struct CAFFE2_API PerChannelSymmetricQuantizer : public SymmetricQuantizer { explicit PerChannelSymmetricQuantizer( + ScalarType scalar_type, const std::vector& scales, const std::vector& axis) - : SymmetricQuantizer(kPerChannelSymmetric), scales_(scales), axis_(axis) { - AT_CHECK( + : SymmetricQuantizer(kPerChannelSymmetric, scalar_type), scales_(scales), axis_(axis) { + TORCH_CHECK( axis_.size() == 1, "Per channel symmetric quantization in multiple axis is not supported yet."); } @@ -164,8 +170,8 @@ struct CAFFE2_API PerChannelSymmetricQuantizer : public SymmetricQuantizer { * all the values in the Tensor. 
*/ struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { - explicit PerTensorAffineQuantizer(float scale, uint8_t zero_point) - : AffineQuantizer(kPerTensorAffine), + explicit PerTensorAffineQuantizer(ScalarType scalar_type, float scale, int32_t zero_point) + : AffineQuantizer(kPerTensorAffine, scalar_type), scale_(scale), zero_point_(zero_point) {} @@ -176,13 +182,14 @@ struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { return scale_; } - uint8_t zero_point() const { + int32_t zero_point() const { return zero_point_; } private: const float scale_; - const uint8_t zero_point_; + // We use int32_t to support both uint8_t and int32_t data types + const int32_t zero_point_; }; /** @@ -192,14 +199,15 @@ struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { */ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { explicit PerChannelAffineQuantizer( + ScalarType scalar_type, const std::vector& scales, - const std::vector& zero_points, + const std::vector& zero_points, const std::vector& axis) - : AffineQuantizer(kPerChannelAffine), - scales_(scales), - zero_points_(zero_points), - axis_(axis) { - AT_CHECK( + : AffineQuantizer(kPerChannelAffine, scalar_type), + scales_(scales), + zero_points_(zero_points), + axis_(axis) { + TORCH_CHECK( axis_.size() == 1, "Per channel affine quantization in multiple axis is not supported yet."); } @@ -208,7 +216,7 @@ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { return scales_; } - std::vector zero_points() const { + std::vector zero_points() const { return zero_points_; } @@ -216,9 +224,12 @@ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { return axis_; } + Tensor quantize(Tensor tensor) override; + Tensor dequantize(Tensor tensor) override; + private: const std::vector scales_; - const std::vector zero_points_; + const std::vector zero_points_; const std::vector axis_; }; @@ -229,13 +240,21 @@ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { // This may be called repeatedly, so make sure it's pretty cheap. 
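// Worked example of the corrected affine mapping documented above (numbers are
// illustrative): with scale = 0.5, zero_point = 3 and a quint8 range of [0, 255],
//   quantize:   X = 2.0  ->  Y = clamp(round(2.0 / 0.5 + 3), 0, 255) = 7
//   dequantize: Y = 7    ->  X = (7 - 3) * 0.5 = 2.0
// i.e. quantization divides by the scale and dequantization multiplies by it,
// matching what quantize_val / dequantize_tensor compute; the previous comments
// had the scale on the wrong side of both formulas.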
CAFFE2_API QTensorImpl* get_qtensorimpl(const Tensor& self); -// Quantize a float value into a uint8 value given scale and zero_point -CAFFE2_API qint8 quantize_uint8(float scale, uint8_t zero_point, float value); +// Quantize a float value into a uint value given scale and zero_point +template +CAFFE2_API T quantize_val(float scale, int32_t zero_point, float value); +template +CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template +CAFFE2_API Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, float scale, int32_t zero_point); // double and int64_t are because of the native function API, we only have these // argument types right now in native functions CAFFE2_API QuantizerPtr -make_per_tensor_affine_quantizer(double scale, int64_t zero_point); +make_per_tensor_affine_quantizer(double scale, int64_t zero_point, ScalarType scalar_type); + +CAFFE2_API QuantizerPtr +make_per_channel_affine_quantizer(std::vector scales, std::vector zero_points, std::vector axis, ScalarType scalar_type); // Create a Quantized Tensor given arguments for normal Tensor and a quantizer CAFFE2_API Tensor new_qtensor_cpu( diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index d8d8622ce2f7..8dc9ebd4e152 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -30,7 +30,7 @@ inline Tensor from_blob( const TensorOptions& options = {}) { auto device = getType(options).getDeviceFromPtr(data); if (options.device().has_index()) { - AT_CHECK( + TORCH_CHECK( options.device() == device, "Specified device ", options.device(), " does not match device of data ", device); @@ -71,11 +71,11 @@ inline Tensor from_blob( namespace detail { static inline TypeExtendedInterface & infer_type(const Tensor & t) { - AT_CHECK(t.defined(), "undefined Tensor"); + TORCH_CHECK(t.defined(), "undefined Tensor"); return getType(t); } static inline TypeExtendedInterface & infer_type(const TensorList & tl) { - AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); return getType(tl[0]); } diff --git a/aten/src/ATen/templates/LegacyTHFunctions.h b/aten/src/ATen/templates/LegacyTHFunctions.h index fb91eeb4b2dd..fe34623618db 100644 --- a/aten/src/ATen/templates/LegacyTHFunctions.h +++ b/aten/src/ATen/templates/LegacyTHFunctions.h @@ -9,11 +9,11 @@ namespace th { namespace detail { static inline LegacyTHDispatcher & infer_dispatcher(const Tensor & t) { - AT_CHECK(t.defined(), "undefined Tensor"); + TORCH_CHECK(t.defined(), "undefined Tensor"); return getLegacyTHDispatcher(t); } static inline LegacyTHDispatcher & infer_dispatcher(const TensorList & tl) { - AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); return getLegacyTHDispatcher(tl[0]); } diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index b8fab0379096..32a538017398 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,7 @@ class CAFFE2_API Tensor { return impl_.weak_use_count(); } - const char * toString() const; + std::string toString() const; IntArrayRef sizes() const { return impl_->sizes(); @@ -165,8 +166,8 @@ class CAFFE2_API Tensor { int64_t ndimension() const { return dim(); } - bool is_contiguous() const { - return impl_->is_contiguous(); + 
bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const { + return impl_->is_contiguous(memory_format); } // Total bytes consumed by the "view" of elements of the array. Does not @@ -193,7 +194,7 @@ class CAFFE2_API Tensor { return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( tensorTypeIdToBackend(type_id()), scalar_type(), - is_variable() && !at::NonVariableTypeMode::is_enabled()); + is_variable()); } Type & dispatch_type() const { return legacyTensorType(*impl_); @@ -266,7 +267,7 @@ class CAFFE2_API Tensor { template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return TensorAccessor(data(),sizes().data(),strides().data()); } template @@ -280,7 +281,7 @@ class CAFFE2_API Tensor { template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> PackedTensorAccessor packed_accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); } template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index b9c2e9cbf1e9..b78a83ccb294 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -131,7 +132,7 @@ inline bool is_quantized(Tensor self) { #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ - AT_CHECK( \ + TORCH_CHECK( \ scalar_type() == ScalarType::name, \ "expected scalar type ", \ #name, \ diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index a6f4f479bf19..f6b3d8be7417 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index fed4545b11df..685771858e2b 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -31,7 +31,7 @@ struct ${Type} final : public ${DeviceType}TypeDefault { return t.scalar_type(); } ScalarType infer_scalar_type(const TensorList & tl) const { - AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); return tl[0].scalar_type(); } }; diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 20b643c521e4..43e9ec305fb8 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -31,6 +31,7 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cuda_apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp) diff --git a/aten/src/ATen/test/cuda_distributions_test.cu 
b/aten/src/ATen/test/cuda_distributions_test.cu new file mode 100644 index 000000000000..027c3157a59d --- /dev/null +++ b/aten/src/ATen/test/cuda_distributions_test.cu @@ -0,0 +1,143 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include + +__global__ void expected_randoms(float* x, uint64_t counter_offset) { + for(int i=0; i < 4; i++) { + curandStatePhilox4_32_10_t state; + curand_init( + 123, + i, + counter_offset, + &state); + auto ret = curand_uniform4(&state); + x[i] = ret.x; + } +} + +TEST(DistributionsTest, TestPhiloxIncrementSmallTensor) { + // Test Description: + // In Distributions.cu we mentioned that philox increment + // should be at least the number of curand() random numbers used in + // each thread. In this test, we make sure that uniform_ correctly + // increments philox and doesn't reuse randoms from previous calls + // for a small tensor size of 4. + // - We check that by first getting 4 randoms from uniform_. + // Once we get these 4 randoms, that would mean that philox counter for + // thread 0, 1, 2 and 3, was incremented by 4 (check calc_execution_policy + // function for details). + // - Now get 4 randoms with offset=4 for thread {0,1,2,3} from expected_randoms + // kernel above. + // - Now get 4 more randoms from uniform_ (note thread {0,1,2,3} for this call would + // start from a philox_offset value of 4) + // - the 4 randoms from expected_randoms and the 4 randoms from the previous call + // of uniform_ should match, signifying that the philox offset was + // incremented properly and no randoms are being reused from previous calls + + // if cuda not available, return + if (!at::cuda::is_available()) return; + + // manual seed to 123 + at::manual_seed(123); + + // get 4 randoms from uniform_(), philox offset is now incremented to 4 by this call + at::empty({4}, at::TensorOptions(at::kCUDA)).uniform_(); + + // allocate 4 float on host memory + float *x; + cudaMallocManaged(&x, 4*sizeof(float)); + + // launch kernel to get expected randoms + expected_randoms<<<1, 1>>>(x, 4); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // get 4 new float from uniform_() + auto self = at::empty({4}, at::TensorOptions(at::kCUDA)); + self.uniform_(); + + // check randoms from expected_randoms kernel are equal to the randoms from the second + // call of uniform_() + for (int i = 0; i < 4; i++) { + ASSERT_EQ(self[i].item().to(), x[i]); + } + + // Free memory + cudaFree(x); +} + +TEST(DistributionsTest, TestPhiloxIncrementBigTensor) { + // Test Description: + // In Distributions.cu we mentioned that philox increment + // should be at least the number of curand() random numbers used in + // each thread. In this test, we make sure that uniform_ correctly + // increments philox and doesn't reuse randoms from previous calls + // for a big size tensor. + // - First of all, we come up with what the size of the big tensor + // should be for this test. Our goal is to show that when the uniform_ + // kernel runs at full occupancy (i.e. when the number of elements is + // greater the number of threads launched), it hits the unroll loop in + // the uniform_ kernel. + // - Hence, we set the size of the tensor in this test to be 8 times the + // maximum number of threads we can launch. 
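Before the big-tensor description continues below, here is the arithmetic behind the offsets 4 and 8 that both tests pass to expected_randoms: each curand_uniform4 call yields four floats per thread, so a uniform_ launch has to advance every thread's Philox counter by 4 * ceil(elements_per_thread / 4). The helper below is only an illustrative sketch under that assumption; the name philox_offset_per_thread is invented here, and the real computation lives in calc_execution_policy in Distributions.cu.

#include <cstdint>
#include <iostream>

// Illustrative only: how far a launch must advance each thread's Philox counter,
// given that curand_uniform4 produces four floats per call.
uint64_t philox_offset_per_thread(uint64_t elements_per_thread) {
  const uint64_t floats_per_curand_call = 4;  // curand_uniform4 returns a float4
  const uint64_t calls =
      (elements_per_thread + floats_per_curand_call - 1) / floats_per_curand_call;
  return calls * floats_per_curand_call;
}

int main() {
  // Small-tensor test: one element per thread -> counter advances by 4.
  std::cout << philox_offset_per_thread(1) << "\n";  // prints 4
  // Big-tensor test: eight elements per thread -> counter advances by 8.
  std::cout << philox_offset_per_thread(8) << "\n";  // prints 8
  return 0;
}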
This means that, each thread will + // be yielding 8 elements, and as a result, curand_uniform4 will be called twice + // and all the 8 elements in a thread will consume all the float4 from the + // two calls of curand_unfiorm4 as a result of the unroll loop. Therefore, + // after this call to the unform_, counter_offset for the next call to uniform_ + // will start from 8. This is what we test next. + // - Now get 4 randoms with offset=8 for thread {0,1,2,3} from expected_randoms + // kernel above. + // - Now get 4 more randoms from uniform_ (note thread {0,1,2,3} for this call would + // start from a philox_offset value of 8) + // - the 4 randoms from expected_randoms kernel and the 4 randoms from the previous call + // of uniform_ should match, signifying that the philox offset was + // incremented properly and no randoms are being reused from previous calls + + // if cuda not available, return + if (!at::cuda::is_available()) return; + + // manual seed to 123 + at::manual_seed(123); + + // calculate maximum number of threads that can be launched + // and set the numel to be 8 times that + const int block_size = 256; + dim3 dim_block(block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + dim3 grid(static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm); + auto numel = block_size * grid.x * 8; + + // get numel randoms from uniform_(), philox offset is now incremented to 8 by this call + at::empty({numel}, at::TensorOptions(at::kCUDA)).uniform_(); + + // allocate 4 float on host memory + float *x; + cudaMallocManaged(&x, 4*sizeof(float)); + + // launch kernel to get expected randoms + expected_randoms<<<1, 1>>>(x, 8); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // get 4 new float from uniform_() + auto self = at::empty({4}, at::TensorOptions(at::kCUDA)); + self.uniform_(); + + // check randoms from expected_randoms kernel are equal to the randoms from the second + // call of uniform_() + for (int i = 0; i < 4; i++) { + ASSERT_EQ(self[i].item().to(), x[i]); + } + + // Free memory + cudaFree(x); +} diff --git a/aten/src/ATen/test/quantized_test.cpp b/aten/src/ATen/test/quantized_test.cpp index 103e69e18662..64b4f2e4acc1 100644 --- a/aten/src/ATen/test/quantized_test.cpp +++ b/aten/src/ATen/test/quantized_test.cpp @@ -7,7 +7,7 @@ #include #include #include -// For quantize_uint8 +// For quantize_val #include #include @@ -18,7 +18,7 @@ TEST(TestQTensor, QuantDequantAPIs) { Tensor r = at::ones({num_elements}); const float scale = 1.0; const int32_t zero_point = 2; - Tensor qr = r.quantize_linear(scale, zero_point); + Tensor qr = r.quantize_linear(scale, zero_point, kQUInt8); ASSERT_EQ(qr.q_scale().to(), scale); ASSERT_EQ(qr.q_zero_point().to(), zero_point); ASSERT_TRUE(qr.is_quantized()); @@ -33,10 +33,10 @@ TEST(TestQTensor, QuantDequantAPIs) { // Check for correct quantization auto r_data = r.data(); - auto qr_data = qr.data(); + auto qr_data = qr.data(); for (auto i = 0; i < num_elements; ++i) { ASSERT_EQ( - quantize_uint8(scale, zero_point, r_data[i]).val_, qr_data[i].val_); + quantize_val(scale, zero_point, r_data[i]).val_, qr_data[i].val_); } // Check for correct dequantization @@ -60,9 +60,9 @@ TEST(TestQTensor, RoundingMode) { 6, 6, 8, 8, 10, 10}; // scale = 1.0 Tensor x = from_blob(x_values.data(), x_values.size()); - Tensor qx = x.quantize_linear(/*scale=*/1.0, zero_point); + Tensor qx = x.quantize_linear(/*scale=*/1.0, zero_point, kQUInt8); 
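The QuantDequantAPIs and RoundingMode checks in this file all exercise the same per-element affine mapping behind quantize_val and quantize_linear. Below is a minimal standalone sketch of that mapping, assuming round-half-to-even tie-breaking and the quint8 range; it illustrates the scheme the tests check, and is not the library's quantize_val implementation (quantize_val_sketch is an invented name).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Illustrative affine quantization: q = clamp(zero_point + round(value / scale), 0, 255).
// std::nearbyint rounds half-to-even under the default rounding mode, which is the
// tie-breaking behavior the RoundingMode test verifies.
uint8_t quantize_val_sketch(float scale, int32_t zero_point, float value) {
  const int32_t q = zero_point + static_cast<int32_t>(std::nearbyint(value / scale));
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}

int main() {
  // With scale = 1.0 and zero_point = 2 (the QuantDequantAPIs setup), 1.0f maps to 3.
  std::cout << static_cast<int>(quantize_val_sketch(1.0f, 2, 1.0f)) << "\n";  // prints 3
  // Ties round to even: 0.5 -> 0 and 1.5 -> 2 (with a zero point of 0).
  std::cout << static_cast<int>(quantize_val_sketch(1.0f, 0, 0.5f)) << "\n";  // prints 0
  std::cout << static_cast<int>(quantize_val_sketch(1.0f, 0, 1.5f)) << "\n";  // prints 2
  return 0;
}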
- auto qx_data = qx.data(); + auto qx_data = qx.data(); for (int idx = 0; idx < x_values.size(); ++idx) { ASSERT_EQ(qx_expect[idx], qx_data[idx].val_) << "Tie breaking during rounding element " << idx << " failed!"; @@ -73,7 +73,7 @@ TEST(TestQTensor, Item) { Tensor r = at::ones({1}); const float scale = 1; const int32_t zero_point = 2; - Tensor qr = r.quantize_linear(scale, zero_point); + Tensor qr = r.quantize_linear(scale, zero_point, kQUInt8); ASSERT_EQ(r.item().to(), qr.item().to()); } @@ -82,9 +82,9 @@ TEST(TestQTensor, EmptyQuantized) { int zero_point = 10; int val = 100; int numel = 10; - Tensor q = at::_empty_affine_quantized({numel}, at::device(at::kCPU).dtype(kQInt8), scale, zero_point); + Tensor q = at::_empty_affine_quantized({numel}, at::device(at::kCPU).dtype(kQUInt8), scale, zero_point); // Assigning to QTensor - auto* q_data = q.data(); + auto* q_data = q.data(); for (int i = 0; i < numel; ++i) { q_data[i].val_ = val; } diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 1c0d8576e32d..6f9ae19485e4 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include @@ -11,8 +11,8 @@ void test(int given_num_threads) { at::init_num_threads(); auto t = at::ones({1000 * 1000}, at::CPU(at::kFloat)); - ASSERT(given_num_threads >= 0); - ASSERT(at::get_num_threads() == given_num_threads); + ASSERT_TRUE(given_num_threads >= 0); + ASSERT_EQ(at::get_num_threads(), given_num_threads); auto t_sum = t.sum(); for (int i = 0; i < 1000; ++i) { t_sum = t_sum + t.sum(); @@ -38,5 +38,11 @@ int main() { at::set_num_threads(5); test(at::get_num_threads()); + // test inter-op settings + ASSERT_EQ(at::get_num_interop_threads(), std::thread::hardware_concurrency()); + at::set_num_interop_threads(5); + ASSERT_EQ(at::get_num_interop_threads(), 5); + ASSERT_ANY_THROW(at::set_num_interop_threads(6)); + return 0; } diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp index b73ea191795d..595462a2b9ba 100644 --- a/aten/src/TH/THAllocator.cpp +++ b/aten/src/TH/THAllocator.cpp @@ -89,10 +89,8 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, hfilesz.QuadPart = size; if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { - handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); event_ = CreateEvent(nullptr, FALSE, FALSE, eventname); } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { - handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); event_ = OpenEvent(EVENT_ALL_ACCESS, FALSE, eventname); } else { AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); @@ -102,6 +100,14 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, AT_ERROR("Couldn't open shared event: <", eventname, ">, error code: <", GetLastError(), ">"); } + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); + } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); + } else { + AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); + } + if (handle_ == nullptr) { AT_ERROR("Couldn't open shared file mapping: <", filename, ">, error code: <", GetLastError(), ">"); } diff --git a/aten/src/TH/THTensor.cpp 
b/aten/src/TH/THTensor.cpp index 06b02d8dd5bd..0c5cebd223e1 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -79,7 +79,7 @@ void THTensor_resize(THTensor *self, at::IntArrayRef size, at::IntArrayRef strid void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride) { - AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); + TORCH_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); at::IntArrayRef sizes(size, nDimension); at::optional strides; if (stride) { @@ -167,7 +167,7 @@ void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { // We used to allow this, but this breaks device caching, // see Note [We regret making Variable hold a Tensor] // Let's put an actual error message for this one. - AT_CHECK(tensor->storage().device() == storage->device(), + TORCH_CHECK(tensor->storage().device() == storage->device(), "Attempted to set the storage of a tensor on device \"", tensor->storage().device(), "\" to a storage on different device \"", storage->device(), "\". This is no longer allowed; the devices must match."); diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h index c73415dc0816..61c05160b190 100644 --- a/aten/src/TH/THTensor.h +++ b/aten/src/TH/THTensor.h @@ -31,6 +31,9 @@ #include #include +#include +#include + /* fill and zero*/ #include #include diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 1ebaf9094a67..a9c89f222189 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -34,7 +34,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { // for the first time (providing the necessary type). It is an ERROR to // invoke any PyTorch operations on such a half-constructed storage, // and this check tests for that case. - AT_CHECK(tensor->storage(), "Cannot use PyTorch operations on a half-constructed " + TORCH_CHECK(tensor->storage(), "Cannot use PyTorch operations on a half-constructed " "tensor. 
If this tensor came from Caffe2, please call GetMutableData on " "it first; otherwise, this is a bug, please report it."); return tensor->storage().unsafeGetStorageImpl(); diff --git a/aten/src/TH/THTensorEvenMoreMath.cpp b/aten/src/TH/THTensorEvenMoreMath.cpp index a0b9e190998d..432deb26828d 100644 --- a/aten/src/TH/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/THTensorEvenMoreMath.cpp @@ -8,3 +8,6 @@ #include #include + +#include +#include diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp index 1c81ed291dad..23a2b3f8b6f4 100644 --- a/aten/src/TH/generic/THLapack.cpp +++ b/aten/src/TH/generic/THLapack.cpp @@ -11,12 +11,8 @@ TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, do TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); TH_EXTERNC void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info); TH_EXTERNC void sgesdd_(char *jobz, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *iwork, int *info); -TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); -TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); TH_EXTERNC void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); -TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); -TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info); TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info); TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); @@ -89,20 +85,6 @@ void THLapack_(gesdd)(char jobz, int m, int n, scalar_t *a, int lda, scalar_t *s #endif } -/* LU decomposition */ -void THLapack_(getrf)(int m, int n, scalar_t *a, int lda, int *ipiv, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgetrf_(&m, &n, a, &lda, ipiv, info); -#else - sgetrf_(&m, &n, a, &lda, ipiv, info); -#endif -#else - THError("getrf : Lapack library not found in compile time\n"); -#endif -} - void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info) { #ifdef USE_LAPACK @@ -116,20 +98,6 @@ void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ip #endif } -/* Matrix Inverse */ -void THLapack_(getri)(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int* info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgetri_(&n, a, &lda, ipiv, work, &lwork, info); -#else - sgetri_(&n, a, &lda, ipiv, work, &lwork, info); -#endif -#else - THError("getri : Lapack library not found in compile time\n"); -#endif -} - /* Cholesky factorization based Matrix Inverse */ void THLapack_(potri)(char uplo, int n, scalar_t *a, int lda, int *info) { diff --git a/aten/src/TH/generic/THLapack.h b/aten/src/TH/generic/THLapack.h index 055783464d4f..20d469d1eb6e 100644 --- a/aten/src/TH/generic/THLapack.h +++ b/aten/src/TH/generic/THLapack.h @@ -10,11 +10,7 @@ 
TH_API void THLapack_(syev)(char jobz, char uplo, int n, scalar_t *a, int lda, s TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info); /* svd */ TH_API void THLapack_(gesdd)(char jobz, int m, int n, scalar_t *a, int lda, scalar_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *iwork, int *info); -/* LU decomposition */ -TH_API void THLapack_(getrf)(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); TH_API void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); -/* Matrix Inverse */ -TH_API void THLapack_(getri)(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int* info); /* Positive Definite matrices */ /* Matrix inverse based on Cholesky factorization */ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 2c28efcaf3f1..fcb7e8f4e2b9 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -79,7 +79,7 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor) /* Storage init */ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, at::IntArrayRef sizes, at::IntArrayRef strides) { if (strides.data()) { - AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + TORCH_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } THTensor *self = c10::make_intrusive( c10::intrusive_ptr::reclaim(THStorage_(new)()), @@ -154,7 +154,7 @@ THTensor *THTensor_(newClone)(THTensor *self) THTensor_(resizeAs)(tensor, self); at::Tensor tensor_wrap = THTensor_wrap(tensor); at::Tensor self_wrap = THTensor_wrap(self); - at::_copy_same_type_(tensor_wrap, self_wrap); + at::native::copy_(tensor_wrap, self_wrap, false); return tensor; } @@ -596,7 +596,7 @@ void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) if(self != dst) { at::Tensor dst_wrap = THTensor_wrap(dst); at::Tensor self_wrap = THTensor_wrap(self); - at::_copy_same_type_(dst_wrap, self_wrap); + at::native::copy_(dst_wrap, self_wrap, false); } THTensor_(free)(self); @@ -816,7 +816,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int THTensor_(narrow)(nt, NULL, dimension, offset, dimSize); at::Tensor nt__wrap = THTensor_wrap(nt); at::Tensor inputs_wrap = THTensor_wrap(inputs[j]); - at::_copy_same_type_(nt__wrap, inputs_wrap); + at::native::copy_(nt__wrap, inputs_wrap, false); c10::raw::intrusive_ptr::decref(nt); offset += dimSize; } diff --git a/aten/src/TH/generic/THTensorApply.hpp b/aten/src/TH/generic/THTensorApply.hpp index 7d9b6bc9a0ab..a7994c6bbad1 100644 --- a/aten/src/TH/generic/THTensorApply.hpp +++ b/aten/src/TH/generic/THTensorApply.hpp @@ -61,11 +61,20 @@ if (std::isnan(val)) break; #define th_isnan_break(val) #endif -#ifdef _WIN32 -// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences. -#define PRAGMA_LOOP(P) // Noop +#ifdef _MSC_VER +#define PRAGMA(P) __pragma(P) +# if _MSC_VER < 1920 +// MSVC < 2019 doesn't support loop pragmas. 
+# define PRAGMA_IVDEP // Noop +# define PRAGMA_SIMD // Noop +# else +# define PRAGMA_IVDEP PRAGMA(loop(ivdep)) +# define PRAGMA_SIMD PRAGMA(omp simd) +# endif #else -#define PRAGMA_LOOP(P) _Pragma(#P) +#define PRAGMA(P) _Pragma(#P) +#define PRAGMA_IVDEP PRAGMA(ivdep) +#define PRAGMA_SIMD PRAGMA(simd) #endif #define TH_TENSOR_APPLY2_PARALLEL(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, THRESHOLD) \ @@ -76,7 +85,7 @@ if (std::isnan(val)) break; TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data()+TENSOR2->storage_offset(); \ if (tp != (TYPE2*)rp) { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(ivdep) \ + PRAGMA_IVDEP \ for (auto iter = begin; iter < end; iter++) { \ TYPE2 *TENSOR2##_data = tp+iter; \ TYPE1 *TENSOR1##_data = rp+iter; \ @@ -85,7 +94,7 @@ if (std::isnan(val)) break; }); \ } else { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(simd) \ + PRAGMA_SIMD \ for (auto iter = begin; iter < end; iter++) { \ TYPE2* TENSOR2##_data = tp+iter; \ TYPE1* TENSOR1##_data = rp+iter; \ @@ -165,7 +174,7 @@ if (std::isnan(val)) break; TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data()+TENSOR3->storage_offset(); \ if (tp != (TYPE2*)rp) { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(ivdep) \ + PRAGMA_IVDEP \ for (auto iter = begin; iter < end; iter++) { \ TYPE1 *TENSOR1##_data = rp+iter; \ TYPE2 *TENSOR2##_data = tp+iter; \ @@ -175,7 +184,7 @@ if (std::isnan(val)) break; }); \ } else { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(simd) \ + PRAGMA_SIMD \ for (auto iter = begin; iter < end; iter++) { \ TYPE1 *TENSOR1##_data = rp+iter; \ TYPE2 *TENSOR2##_data = tp+iter; \ diff --git a/aten/src/TH/generic/THTensorConv.cpp b/aten/src/TH/generic/THTensorConv.cpp index 375042787a38..1fbbc6e491ae 100644 --- a/aten/src/TH/generic/THTensorConv.cpp +++ b/aten/src/TH/generic/THTensorConv.cpp @@ -591,8 +591,8 @@ void THTensor_(conv2DRevger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTens scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -696,8 +696,8 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, scalar_t beta, scalar_t alpha, THTen scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -807,8 +807,8 @@ void THTensor_(conv2Dger)(THTensor *r_, scalar_t beta, scalar_t 
alpha, THTensor scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -941,8 +941,8 @@ void THTensor_(conv2Dmv)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor * scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -1082,8 +1082,8 @@ void THTensor_(conv2Dmm)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor * scalar_t *weight_data; scalar_t *output_data; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -1232,8 +1232,8 @@ void THTensor_(conv2Dmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -1291,8 +1291,8 @@ void THTensor_(conv2Dcmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && 
k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -1369,8 +1369,8 @@ void THTensor_(conv2Dmap)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(THTensor_nDimensionLegacyAll(map) == 2 , 4, "map: 2D Tensor expected"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1457,8 +1457,8 @@ void THTensor_(conv3DRevger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTens ptrdiff_t nelem; int64_t k, i; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1543,8 +1543,8 @@ void THTensor_(conv3Dger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k, i; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1634,8 +1634,8 @@ void THTensor_(conv3Dmv)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor * ptrdiff_t nelem; int64_t k, i; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1729,8 +1729,8 @@ void THTensor_(conv3Dmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: 
non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1796,8 +1796,8 @@ void THTensor_(conv3Dcmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -1882,8 +1882,8 @@ void THTensor_(conv3Dmap)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor int64_t nmaps; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(THTensor_nDimensionLegacyAll(map) == 2 , 4, "map: 2D Tensor expected"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 00da33304a1a..8596f8d38df2 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -11,7 +11,7 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) int64_t *subscript_data; int64_t i = 0; #ifdef TH_REAL_IS_HALF -#define IS_NONZERO(val) ((val.x & 0x7fff) != 0) +#define IS_NONZERO(val) (c10::Half(0)!=val) #else #define IS_NONZERO(val) ((val)!=0) #endif @@ -65,8 +65,12 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) ); delete [] sizes; delete [] idx; + +#undef IS_NONZERO } +#if !defined(TH_REAL_IS_HALF) /* non half only part */ + accreal THTensor_(sumall)(THTensor *tensor) { accreal sum = 0; @@ -75,7 +79,76 @@ accreal THTensor_(sumall)(THTensor *tensor) return sum; } -#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ +void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) +{ + ptrdiff_t numel = THByteTensor_sumall(mask); + scalar_t *tensor_data; + +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THTensor_(resize1d)(tensor,numel); + tensor_data = tensor->data(); + TH_TENSOR_APPLY2(scalar_t, src, unsigned char, mask, + if (*mask_data > 1) + { + THFree(mask_counter); + THFree(src_counter); + THError("Mask tensor can 
take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + *tensor_data = *src_data; + tensor_data++; + }); +} + +void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor *src, THBoolTensor *mask) +{ + ptrdiff_t numel = THBoolTensor_sumall(mask); + scalar_t *tensor_data; + +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THTensor_(resize1d)(tensor,numel); + tensor_data = tensor->data(); + TH_TENSOR_APPLY2(scalar_t, src, bool, mask, + if (*mask_data) + { + *tensor_data = *src_data; + tensor_data++; + }); +} + +void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)value; + return THError("bitand is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + if (r_Contig && tContig) { + scalar_t *tp = t->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] & value; + } + }); + } else { + TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data & value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#endif +} + +#if !defined(TH_REAL_IS_BOOL) void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, scalar_t value) { @@ -186,48 +259,6 @@ void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTensor* s c10::raw::intrusive_ptr::decref(srct); } -void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) -{ - ptrdiff_t numel = THByteTensor_sumall(mask); - scalar_t *tensor_data; - -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - THTensor_(resize1d)(tensor,numel); - tensor_data = tensor->data(); - TH_TENSOR_APPLY2(scalar_t, src, unsigned char, mask, - if (*mask_data > 1) - { - THFree(mask_counter); - THFree(src_counter); - THError("Mask tensor can take 0 and 1 values only"); - } - else if (*mask_data == 1) - { - *tensor_data = *src_data; - tensor_data++; - }); -} - -void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor *src, THBoolTensor *mask) -{ - ptrdiff_t numel = THBoolTensor_sumall(mask); - scalar_t *tensor_data; - -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - THTensor_(resize1d)(tensor,numel); - tensor_data = tensor->data(); - TH_TENSOR_APPLY2(scalar_t, src, bool, mask, - if (*mask_data) - { - *tensor_data = *src_data; - tensor_data++; - }); -} - void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) { ptrdiff_t i, numel; @@ -304,7 +335,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens THTensor_(select)(sSlice, src, dim, index_data[i]); at::Tensor tSlice_wrap = THTensor_wrap(tSlice); at::Tensor sSlice_wrap = THTensor_wrap(sSlice); - at::_copy_same_type_(tSlice_wrap, sSlice_wrap); + at::native::copy_(tSlice_wrap, sSlice_wrap); c10::raw::intrusive_ptr::decref(tSlice); c10::raw::intrusive_ptr::decref(sSlice); } @@ -337,7 +368,7 @@ void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTens THTensor_(select)(sSlice, src, dim, i); at::Tensor tSlice_wrap = THTensor_wrap(tSlice); at::Tensor sSlice_wrap = THTensor_wrap(sSlice); - at::_copy_same_type_(tSlice_wrap, sSlice_wrap); + at::native::copy_(tSlice_wrap, sSlice_wrap); } c10::raw::intrusive_ptr::decref(tSlice); @@ 
-877,32 +908,7 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, scalar_t value) } } -void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)value; - return THError("bitand is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - if (r_Contig && tContig) { - scalar_t *tp = t->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] & value; - } - }); - } else { - TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data & value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } #endif -} #endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index e0ed46a8b241..6f53e9ad5ef6 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -83,14 +83,14 @@ static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, if (src->size(0) == nrows) { at::Tensor result_wrap = THTensor_wrap(result); at::Tensor src_wrap = THTensor_wrap(src); - at::_copy_same_type_(result_wrap, src_wrap); + at::native::copy_(result_wrap, src_wrap); } else { view = THTensor_(newNarrow)(result, 0, 0, src->size(0)); at::Tensor view_wrap = THTensor_wrap(view); at::Tensor src_wrap = THTensor_wrap(src); - at::_copy_same_type_(view_wrap, src_wrap); + at::native::copy_(view_wrap, src_wrap); c10::raw::intrusive_ptr::decref(view); } return result; @@ -118,7 +118,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " "dimensions, but has %d", b->dim()); THArgCheck(!b->is_empty(), 1, "B should not be empty"); - AT_CHECK(a->size(0) == b->size(0), "Expected A and b to have same size " + TORCH_CHECK(a->size(0) == b->size(0), "Expected A and b to have same size " "at dim 0, but A has ", a->size(0), " rows and B has ", b->size(0), " rows"); if (THTensor_nDimensionLegacyAll(b) == 1) { @@ -432,7 +432,7 @@ void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra THTensor_(resizeAs)(rv_, rvf_); at::Tensor rv__wrap = THTensor_wrap(rv_); at::Tensor rvf__wrap = THTensor_wrap(rvf_); - at::_copy_same_type_(rv__wrap, rvf__wrap); + at::native::copy_(rv__wrap, rvf__wrap); c10::raw::intrusive_ptr::decref(rvf_); } else { THTensor_(zero)(ru_); @@ -440,50 +440,6 @@ void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra } } -void THTensor_(getri)(THTensor *ra_, THTensor *a) -{ - if (a == NULL) a = ra_; - THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - - int m, n, lda, info, lwork; - scalar_t wkopt; - THIntTensor *ipiv; - THTensor *work; - THTensor *ra__ = NULL; - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - m = ra__->size(0); - n = ra__->size(1); - lda = m; - ipiv = THIntTensor_newWithSize1d((int64_t)m); - - /* Run LU */ - THLapack_(getrf)(n, n, ra__->data(), lda, THIntTensor_data(ipiv), &info); - THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", - THCleanup( - c10::raw::intrusive_ptr::decref(ra__); - 
THIntTensor_free(ipiv);), - "getrf", info, info); - - /* Run inverse */ - THLapack_(getri)(n, ra__->data(), lda, THIntTensor_data(ipiv), &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(getri)(n, ra__->data(), lda, THIntTensor_data(ipiv), work->data(), lwork, &info); - THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", - THCleanup( - c10::raw::intrusive_ptr::decref(ra__); - c10::raw::intrusive_ptr::decref(work); - THIntTensor_free(ipiv);), - "getri", info, info); - - THTensor_(freeCopyTo)(ra__, ra_); - c10::raw::intrusive_ptr::decref(work); - THIntTensor_free(ipiv); -} - void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) { THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional"); @@ -831,9 +787,9 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots) { - AT_CHECK(!atf->is_empty() && THTensor_(nDimensionLegacyNoScalars)(atf) == 3, "expected non-empty 3D tensor, got size: ", + TORCH_CHECK(!atf->is_empty() && THTensor_(nDimensionLegacyNoScalars)(atf) == 3, "expected non-empty 3D tensor, got size: ", atf->sizes()); - AT_CHECK(!b->is_empty() && (THTensor_(nDimensionLegacyNoScalars)(b) == 3 || + TORCH_CHECK(!b->is_empty() && (THTensor_(nDimensionLegacyNoScalars)(b) == 3 || THTensor_(nDimensionLegacyNoScalars)(b) == 2), "expected non-empty 2D or 3D tensor, got size: ", b->sizes()); THArgCheck(THTensor_(size)(atf, 0) == THTensor_(size)(b, 0), 3, "number of batches must be equal"); @@ -846,7 +802,7 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor THTensor_(resizeAs)(rb_, b); at::Tensor rb__wrap = THTensor_wrap(rb_); at::Tensor b_wrap = THTensor_wrap(b); - at::_copy_same_type_(rb__wrap, b_wrap); + at::native::copy_(rb__wrap, b_wrap); } int64_t num_batches = atf->size(0); diff --git a/aten/src/TH/generic/THTensorLapack.h b/aten/src/TH/generic/THTensorLapack.h index 4c693a870a86..5c512ab98110 100644 --- a/aten/src/TH/generic/THTensorLapack.h +++ b/aten/src/TH/generic/THTensorLapack.h @@ -8,7 +8,6 @@ TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const ch TH_API void THTensor_(gesdd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *some, const char* compute_uv); TH_API void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char *some, const char* compute_uv); -TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a); TH_API void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo); TH_API void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a); TH_API void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a); diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 4dbc8fbdaecb..52718e541d57 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -21,20 +21,115 @@ // sense (rather than just having cut the file down the middle, which is // what I did when I split these up originally). 
-#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ +void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)src; + return THError("cbitand is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + scalar_t *tp = t->data(); + scalar_t *sp = src->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] & sp[i]; + } + }); + } else { + TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } + } else { + TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;); + } +#endif +} -// Should wrap if the value (a) has a different sign than the divisor (b), but is not 0. -static inline bool modulo_wrap(scalar_t a, scalar_t b) { - return (a != 0) && (a < 0) != (b < 0); +void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)src; + return THError("cbitor is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + scalar_t *tp = t->data(); + scalar_t *sp = src->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] | sp[i]; + } + }); + } else { + TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } + } else { + TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;); + } +#endif } -void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value) +void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)src; + return THError("cbitxor is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + scalar_t *tp = t->data(); + scalar_t *sp = src->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] ^ sp[i]; + } + }); + } else { + TH_TENSOR_APPLY3_PARALLEL(r_Size, 
r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } + } else { + TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;); + } +#endif +} + +void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value) { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) (void)r_; (void)t; (void)value; - return THError("bitor is only supported for integer type tensors"); + return THError("bitxor is only supported for integer type tensors"); #else THTensor_(resizeAs)(r_, t); int64_t r_Size = THTensor_(nElement)(r_); @@ -46,22 +141,22 @@ void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value) at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, [&](int64_t start, int64_t end) { for (auto i = start; i < end; i++) { - rp[i] = tp[i] | value; + rp[i] = tp[i] ^ value; } }); } else { - TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data | value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data ^ value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); } #endif } -void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value) +void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value) { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) (void)r_; (void)t; (void)value; - return THError("bitxor is only supported for integer type tensors"); + return THError("bitor is only supported for integer type tensors"); #else THTensor_(resizeAs)(r_, t); int64_t r_Size = THTensor_(nElement)(r_); @@ -73,15 +168,22 @@ void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value) at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, [&](int64_t start, int64_t end) { for (auto i = start; i < end; i++) { - rp[i] = tp[i] ^ value; + rp[i] = tp[i] | value; } }); } else { - TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data ^ value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data | value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); } #endif } +#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ + +// Should wrap if the value (a) has a different sign than the divisor (b), but is not 0. 
+static inline bool modulo_wrap(scalar_t a, scalar_t b) { + return (a != 0) && (a < 0) != (b < 0); +} + void THTensor_(clamp)(THTensor *r_, THTensor *t, scalar_t min_value, scalar_t max_value) { THTensor_(resizeAs)(r_, t); @@ -176,7 +278,7 @@ void THTensor_(pow)(THTensor *r_, THTensor *t, scalar_t value) if(value == 1) { at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } else if(value == 2){ THTensor_(cmul)(r_, t, t); @@ -453,108 +555,6 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src) } } -void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)src; - return THError("cbitand is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int64_t srcSize = THTensor_(nElement)(src); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - int srcContig = THTensor_(isContiguous)(src); - if (srcSize == r_Size){ - if (r_Contig && tContig && srcContig) { - scalar_t *tp = t->data(); - scalar_t *sp = src->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] & sp[i]; - } - }); - } else { - TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } - } else { - TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;); - } -#endif -} - -void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)src; - return THError("cbitor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int64_t srcSize = THTensor_(nElement)(src); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - int srcContig = THTensor_(isContiguous)(src); - if (srcSize == r_Size){ - if (r_Contig && tContig && srcContig) { - scalar_t *tp = t->data(); - scalar_t *sp = src->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] | sp[i]; - } - }); - } else { - TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } - } else { - TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;); - } -#endif -} - -void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)src; - return THError("cbitxor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int64_t srcSize = THTensor_(nElement)(src); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - int srcContig = THTensor_(isContiguous)(src); - if (srcSize == r_Size){ - if (r_Contig && tContig && 
srcContig) { - scalar_t *tp = t->data(); - scalar_t *sp = src->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] ^ sp[i]; - } - }); - } else { - TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } - } else { - TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;); - } -#endif -} - void THTensor_(tpow)(THTensor *r_, scalar_t value, THTensor *t) { THTensor_(resizeAs)(r_, t); @@ -582,7 +582,7 @@ void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } int64_t r_Size = THTensor_(nElement)(r_); int64_t src1Size = THTensor_(nElement)(src1); @@ -604,7 +604,7 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } int64_t r_Size = THTensor_(nElement)(r_); int64_t src1Size = THTensor_(nElement)(src1); @@ -645,7 +645,7 @@ void THTensor_(addmv)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } auto r_stride = THTensor_strideLegacyNoScalars(r_, 0); @@ -768,7 +768,7 @@ void THTensor_(addmm)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, if (beta != 0.0) { at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } } @@ -905,7 +905,7 @@ void THTensor_(addr)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, T THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } if(beta == 0) { @@ -970,7 +970,7 @@ void THTensor_(addbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t al if (beta != 0.0) { at::Tensor result_wrap = THTensor_wrap(result); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(result_wrap, t_wrap); + at::native::copy_(result_wrap, t_wrap); } } diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h index 82823441aee8..7ec9d7854ba6 100644 --- a/aten/src/TH/generic/THTensorMath.h +++ b/aten/src/TH/generic/THTensorMath.h @@ -4,6 +4,8 @@ TH_API void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor); +#ifndef TH_REAL_IS_HALF + TH_API void THTensor_(ltValue)(THByteTensor *r_, THTensor* t, scalar_t value); TH_API void THTensor_(leValue)(THByteTensor *r_, THTensor* t, scalar_t value); TH_API void THTensor_(gtValue)(THByteTensor *r_, THTensor* t, scalar_t value); @@ -35,14 +37,24 @@ TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); TH_API accreal THTensor_(sumall)(THTensor *t); TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb); +TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); +TH_API void 
THTensor_(maskedSelectBool)(THTensor *tensor, THTensor* src, THBoolTensor *mask); + +TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value); +TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value); +TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value); +TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); + +TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); + #if !defined(TH_REAL_IS_BOOL) /* non bool only part */ TH_API void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, scalar_t value); TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src); -TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); TH_API void THTensor_(maskedFillBool)(THTensor *tensor, THBoolTensor *mask, scalar_t value); TH_API void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTensor* src); -TH_API void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor* src, THBoolTensor *mask); TH_API void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); TH_API void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); @@ -75,9 +87,6 @@ TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, scalar_t min_value, scalar_t max_value); -TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value); -TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value); -TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src); TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, scalar_t value, THTensor *src2); @@ -88,9 +97,6 @@ TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2); TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2); @@ -113,7 +119,6 @@ TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim); TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension); TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension); -TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); TH_API accreal THTensor_(trace)(THTensor *t); TH_API void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src); @@ -183,3 +188,4 @@ TH_API void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alp #endif #endif +#endif diff --git 
a/aten/src/TH/generic/THTensorMoreMath.cpp b/aten/src/TH/generic/THTensorMoreMath.cpp index 1fcb8ee231df..e22ccdcea45e 100644 --- a/aten/src/TH/generic/THTensorMoreMath.cpp +++ b/aten/src/TH/generic/THTensorMoreMath.cpp @@ -63,6 +63,26 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) return equal; } +void THTensor_(sign)(THTensor *r_, THTensor *t) +{ + THTensor_(resizeAs)(r_, t); + +#if defined (TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, + if (*t_data > 0) *r__data = 1; + else *r__data = 0;); +#elif defined (TH_REAL_IS_BOOL) +TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, + if (*t_data == true) *r__data = false; + else *r__data = true;); +#else + TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, + if (*t_data > 0) *r__data = 1; + else if (*t_data < 0) *r__data = -1; + else *r__data = 0;); +#endif +} + #if !defined(TH_REAL_IS_BOOL) /* non bool only part */ void THTensor_(baddbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *batch1, THTensor *batch2) @@ -91,7 +111,7 @@ void THTensor_(baddbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t a if (beta != 0.0) { at::Tensor result_wrap = THTensor_wrap(result); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(result_wrap, t_wrap); + at::native::copy_(result_wrap, t_wrap); } } @@ -177,7 +197,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int at::Tensor values__wrap = THTensor_wrap(values_); at::Tensor t0_wrap = THTensor_wrap(t0); auto right_shape = t0_wrap.reshape(values__wrap.sizes()); - at::_copy_same_type_(values__wrap, right_shape); + at::native::copy_(values__wrap, right_shape); c10::raw::intrusive_ptr::decref(t0); } else { THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); @@ -261,7 +281,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int at::Tensor values__wrap = THTensor_wrap(values_); at::Tensor t0_wrap = THTensor_wrap(t0); auto right_shape = t0_wrap.reshape(values__wrap.sizes()); - at::_copy_same_type_(values__wrap, right_shape); + at::native::copy_(values__wrap, right_shape); c10::raw::intrusive_ptr::decref(t0); } else { THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); @@ -400,24 +420,6 @@ void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) }); } - -void THTensor_(sign)(THTensor *r_, THTensor *t) -{ - THTensor_(resizeAs)(r_, t); - -#if defined (TH_REAL_IS_BYTE) - TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, - if (*t_data > 0) *r__data = 1; - else *r__data = 0;); -#else - TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, - if (*t_data > 0) *r__data = 1; - else if (*t_data < 0) *r__data = -1; - else *r__data = 0;); -#endif -} - - accreal THTensor_(trace)(THTensor *t) { scalar_t *t_data = t->data(); @@ -737,7 +739,7 @@ void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimensio THTensor_(resizeAs)(rt_, t); at::Tensor rt__wrap = THTensor_wrap(rt_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(rt__wrap, t_wrap); + at::native::copy_(rt__wrap, t_wrap); THLongTensor_resize(ri_, t->sizes(), {}); if(descendingOrder) @@ -1341,7 +1343,7 @@ void THTensor_(renorm)(THTensor *res, THTensor *src, scalar_t value, int dimensi { at::Tensor rowR_wrap = THTensor_wrap(rowR); at::Tensor rowS_wrap = THTensor_wrap(rowS); - at::_copy_same_type_(rowR_wrap, rowS_wrap); + at::native::copy_(rowR_wrap, rowS_wrap); } } diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 4a7dcccfa93d..28c44dcfeaaa 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ 
-18,7 +18,7 @@ foreach(THC_TYPE Byte Char Short Int Long Half Float Double) endforeach() endforeach() -foreach(THC_FILE TensorMathCompareT TensorMathCompare TensorMathReduce TensorMasked) +foreach(THC_FILE TensorMathCompareT TensorMathCompare TensorMathReduce TensorMasked TensorMathPointwise) if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}Bool.cu") FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}Bool.cu" "#include \n#include \n\n#include \n#include \n") diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 39048730a6ee..4927bf2a6800 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -508,37 +508,6 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i } #endif -/* Inverse */ -void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Sgetrf only supports n, lda, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasSgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); -#else - THError("THCudaBlas_Sgetrf not supported in ROCM."); -#endif -} - -void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Dgetrf only supports n, lda, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); -#else - THError("THCudaBlas_Dgetrf not supported in ROCM."); -#endif -} - void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) { #ifndef __HIP_PLATFORM_HCC__ @@ -579,33 +548,3 @@ void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const doub THError("THCudaBlas_Dgetrs not supported in ROCM."); #endif } - -void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Sgetri only supports n, lda, ldc, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasSgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); -#else - THError("THCudaBlas_Sgetri not supported in ROCM."); -#endif -} - -void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Dgetri only supports n, lda, ldc, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasDgetriBatched(handle, n, a, lda, pivot, c, ldc, info, 
batchSize)); -#else - THError("THCudaBlas_Dgetri not supported in ROCM."); -#endif -} diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index 030646892d7c..56e011386a39 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -42,14 +42,7 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); #endif -/* Inverse */ -THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize); -THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize); - THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize); THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize); -THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize); -THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize); - #endif diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 8065d6020fb3..171ec945f575 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -104,7 +104,7 @@ void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, const int64_t *size, const int64_t *stride) { - AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); + TORCH_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); at::IntArrayRef sizes(size, nDimension); at::optional strides; if (stride) { diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h index 264a6b26b1e3..4002a85f7ce7 100644 --- a/aten/src/THC/THCTensorMath.h +++ b/aten/src/THC/THCTensorMath.h @@ -19,9 +19,15 @@ #include #include +#include +#include + #include #include +#include +#include + #include #include @@ -46,6 +52,9 @@ #include #include +#include +#include + #include #include diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index 63eb707cfaf3..fe781feb0d53 100644 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -303,3 +303,6 @@ struct TensorBitXorConstantOp { #include #include + +#include +#include diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index db7f6446335b..f324fa2f0b1a 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -101,24 +101,6 @@ THC_API __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_s } } -// Goes from (0, 1] to [0, 1). Note 1-x is not sufficient since for some floats -// eps near 0, 1-eps will round to 1. 
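// Illustrative sketch (hypothetical names, plain C++, not taken from the diff):
// curand_uniform returns values in (0, 1], and the removed helpers below remap
// that interval to [a, b). The core of the transform, stripped of the CUDA
// scaffolding, is:
static inline float reverse_bounds_sketch(float u) {
  // Send the lone endpoint u == 1.0f to 0.0f. Computing 1 - u instead is not
  // safe: for eps near 0, 1 - eps can round back to exactly 1.
  return u == 1.0f ? 0.0f : u;
}
static inline float uniform_from_curand_sketch(float u, float a, float b) {
  return reverse_bounds_sketch(u) * (b - a) + a;  // result lies in [a, b)
}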
-template -__device__ inline T reverse_bounds(T value) { - if (THCNumerics::eq(value, ScalarConvert::to(1))) { - return ScalarConvert::to(0); - } - return value; -} - - -__device__ inline at::Half half_uniform_scale_and_shift(float x, double a, double b) { - at::Half width = ScalarConvert::to(b - a); - at::Half start = ScalarConvert::to(a); - at::Half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); - return THCNumerics::add(scaled, start); -} - #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ { \ @@ -147,11 +129,6 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) } \ } -// NOTE: curand_uniform is (0, 1] and we want [a, b) -GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) -GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) -GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a) - GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean) GENERATE_KERNEL2(generate_normal, double, double mean, double stdv, double, curand_normal_double, (x * stdv) + mean) @@ -161,7 +138,6 @@ GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uni GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) -GENERATE_KERNEL2(generate_uniform, at::Half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. 
/ lambda * log(x))))) GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) diff --git a/aten/src/THC/generated/THCTensorMathPointwiseBool.cu b/aten/src/THC/generated/THCTensorMathPointwiseBool.cu new file mode 100644 index 000000000000..817106de7175 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseBool.cu @@ -0,0 +1,5 @@ +#include +#include + +#include +#include diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 4844e61a8ad6..c04ec7fdcbbe 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -89,7 +89,7 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) /* Storage init */ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, at::IntArrayRef sizes, at::IntArrayRef strides) { if (strides.data()) { - AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + TORCH_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } THCTensor *self = c10::make_intrusive( c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 3e837e45c267..19fe5a5fc408 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -6,7 +6,7 @@ void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { if (dst == src) return; at::Tensor dst_wrap = THTensor_wrap(dst); at::Tensor src_wrap = THTensor_wrap(src); - at::s_copy_(dst_wrap, src_wrap); + at::native::copy_(dst_wrap, src_wrap); } template <> @@ -16,7 +16,7 @@ THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { THCTensor_resizeAs(state, tensor, self); at::Tensor tensor_wrap = THTensor_wrap(tensor); at::Tensor self_wrap = THTensor_wrap(self); - at::s_copy_(tensor_wrap, self_wrap); + at::native::copy_(tensor_wrap, self_wrap); return tensor; } @@ -37,7 +37,7 @@ void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor if(self != dst) { at::Tensor dst_wrap = THTensor_wrap(dst); at::Tensor self_wrap = THTensor_wrap(self); - at::s_copy_(dst_wrap, self_wrap); + at::native::copy_(dst_wrap, self_wrap); } THCTensor_free(state, self); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index 198495bb881a..a83d6d0da864 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -64,7 +64,7 @@ void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); - AT_CHECK(a_->size(0) == b_->size(0), "Expected A and b to have same size " + TORCH_CHECK(a_->size(0) == b_->size(0), "Expected A and b to have same size " "at dim 0, but A has ", a_->size(0), " rows and B has ", b_->size(0), " rows"); THArgCheck(a_->size(0) >= a_->size(1), 2, "Expected A with shape (m x n) to have " "m >= n. 
The case for m < n is not implemented yet."); @@ -334,112 +334,6 @@ void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTens #endif } -void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) -{ - THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); - THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - -#ifdef USE_MAGMA - int info; - int64_t n = a->size(0); - int lwork = n * magma_get_sgetri_nb(n); - - THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); - scalar_t *input_data = THCTensor_(data)(state, input); - - int *ipiv = th_magma_malloc_pinned(n); - - THCTensor *work = THCTensor_(newWithSize1d)(state, lwork); - scalar_t *work_data = THCTensor_(data)(state, work); - - // Run LU -#if defined(THC_REAL_IS_FLOAT) - magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info); -#else - magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info); -#endif - - if (info > 0) - THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("MAGMA getrf : Argument %d : illegal value", -info); - - // Inverse -#if defined(THC_REAL_IS_FLOAT) - magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); -#else - magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); -#endif - - if (info > 0) - THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("MAGMA getri : Argument %d : illegal value", -info); - - THCTensor_(free)(state, work); - magma_free_pinned(ipiv); - THCTensor_(freeCopyTo)(state, input, ra_); -#else - int64_t n = a->size(0); - - // input - THCTensor *input = THCTensor_(newColumnMajor)(state, a, a); - THCTensor_(resizeNd)(state, ra_, 2, THTensor_getSizePtr(input), THTensor_getStridePtr(input)); - - scalar_t *matrices1[1] = { THCTensor_(data)(state, input) }; - scalar_t *matrices2[1] = { THCTensor_(data)(state, ra_) }; - - // Copy pointers to device. 
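// Note on the removed fallback path that follows: the batched cuBLAS routines
// (cublas*getrfBatched / cublas*getriBatched) expect a device-resident array of
// matrix pointers, so the single host-side pointers are staged into
// d_matrices1/d_matrices2 with cudaMemcpyAsync before running the LU
// factorization (getrf) and then the inverse from the LU factors (getri),
// each with batchSize == 1.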
- auto d_matrices1 = static_cast(THCudaMalloc(state, sizeof(scalar_t*))); - auto d_matrices2 = static_cast(THCudaMalloc(state, sizeof(scalar_t*))); - - THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, sizeof(scalar_t*), - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, sizeof(scalar_t*), - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - int info; - auto info_gpu = static_cast(THCudaMalloc(state, sizeof(int))); - - auto ipiv_gpu = static_cast(THCudaMalloc(state, n * sizeof(int))); - - // Run LU -#if defined(THC_REAL_IS_FLOAT) - THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); -#else - THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); -#endif - - THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); - - if (info > 0) - THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("CUBLAS getrf : Argument %d : illegal value", -info); - - // Inverse -#if defined(THC_REAL_IS_FLOAT) - THCudaBlas_Sgetri(state, n, (const scalar_t**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); -#else - THCudaBlas_Dgetri(state, n, (const scalar_t**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); -#endif - - THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); - - if (info > 0) - THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("CUBLAS getri : Argument %d : illegal value", -info); - - THCudaFree(state, ipiv_gpu); - THCudaFree(state, info_gpu); - - THCudaFree(state, d_matrices1); - THCudaFree(state, d_matrices2); - - THCTensor_(free)(state, input); -#endif -} - __global__ void THCTensor_(copyUpperSymmetric)(scalar_t *input, int n, int len) { for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h index f388f68517c6..0ae49cd65007 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.h +++ b/aten/src/THC/generic/THCTensorMathMagma.h @@ -12,7 +12,6 @@ THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, const char *some, const char* compute_uv); THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *some, const char* compute_uv); -THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a); THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_); THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index c16e36909dd6..667db64cf53b 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -2,6 +2,95 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.cu" #else +int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (!THCTensor_(isSameSizeAs(state, self_, src_))) { + return 0; + } + + // This is not as efficient as TH, but the basic idea: create a buffer that stores + // 1 if the two tensors are equal at a position, otherwise 0. 
If the minimum value + // in this buffer is 1, the two tensors are equal, otherwise they are not + + THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, self_->sizes(), {}); + + if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + unsigned char min = THCudaByteTensor_minall(state, buf); + + THCudaByteTensor_free(state, buf); + + return min != 0; +} + +void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitand only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitxor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +#if !defined(THC_REAL_IS_BOOL) + void THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); @@ -196,91 +285,6 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ THCudaCheck(cudaGetLastError()); } -int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); - if (!THCTensor_(isSameSizeAs(state, self_, src_))) { - return 0; - } - - // This is not as efficient as TH, but the basic idea: create a buffer that stores - // 1 if the two tensors are equal at a position, otherwise 0. 
If the minimum value - // in this buffer is 1, the two tensors are equal, otherwise they are not - - THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, self_->sizes(), {}); - - if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - unsigned char min = THCudaByteTensor_minall(state, buf); - - THCudaByteTensor_free(state, buf); - - return min != 0; -} - -void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - return THError("bitand only supported for integer type tensors"); -#else - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} - -void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - return THError("bitor only supported for integer type tensors"); -#else - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); #endif -} - -void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - return THError("bitxor only supported for integer type tensors"); -#else - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} #endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h index 26efe9db5bfd..bcfb95b103c4 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.h +++ b/aten/src/THC/generic/THCTensorMathPairwise.h @@ -2,6 +2,14 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.h" #else +THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); + +THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); +THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); +THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); + +#if !defined(THC_REAL_IS_BOOL) + THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(add_scaled)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value, scalar_t alpha); @@ -12,10 +20,7 @@ THC_API void THCTensor_(lshift)(THCState *state, THCTensor *self, THCTensor *src THC_API 
void THCTensor_(rshift)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); +#endif #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 6a2ee33ed126..f4a86df732a9 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -4,6 +4,106 @@ #include +void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitand is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + 
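// Illustrative sketch (hypothetical names, plain C++, not taken from the diff):
// reference semantics of the pointwise ops relocated here. Despite the
// "self /= src2" comments, the functors applied are TensorBitAndOp /
// TensorBitOrOp / TensorBitXorOp, i.e. elementwise &, | and ^; sign maps each
// element of a signed type to -1, 0 or 1.
template <typename T>
static void cbitand_reference_sketch(T* self, const T* a, const T* b, long n) {
  for (long i = 0; i < n; ++i) self[i] = a[i] & b[i];  // cbitor uses |, cbitxor uses ^
}
template <typename T>
static void sign_reference_sketch(T* out, const T* in, long n) {
  for (long i = 0; i < n; ++i)
    out[i] = (in[i] > T(0)) ? T(1) : ((in[i] < T(0)) ? T(-1) : T(0));
}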
+void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#if !defined(THC_REAL_IS_BOOL) + #define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \ struct Tensor_##NAME##_##REAL##_Op { \ __device__ __forceinline__ void operator()(scalar_t* out, scalar_t* in) const { \ @@ -75,23 +175,6 @@ IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( abs, THCNumerics::abs, Real) #undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_ #undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC -void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) { - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); - if (self_ == src) { - if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src); - - if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t min_value, scalar_t max_value) { @@ -552,84 +635,5 @@ void THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar THCudaCheck(cudaGetLastError()); } -void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ -#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - return THError("cbitand is only supported for integer type tensors"); -#else - THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self /= src2 - if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src1); - - // self = src1 / src2 - if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); #endif -} - -void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ -#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - return THError("cbitor is only supported for integer type tensors"); -#else - THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self /= src2 - if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src1); - - // self = src1 / src2 - if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} - -void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ -#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - return THError("cbitor is only 
supported for integer type tensors"); -#else - THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self /= src2 - if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src1); - - // self = src1 / src2 - if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h index 5539e8ed1bf8..4f4f209e23d3 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.h +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -2,6 +2,14 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathPointwise.h" #else +THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src); + +#if !defined(THC_REAL_IS_BOOL) + THC_API void THCTensor_(pow)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(tpow)(THCState *state, THCTensor *self, scalar_t value, THCTensor *src); THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); @@ -45,7 +53,6 @@ THC_API void THCTensor_(cinv)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(neg)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(abs)(THCState *state, THCTensor *self, THCTensor *src); -THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(clamp)(THCState *state, THCTensor *self, THCTensor *src, scalar_t min_value, scalar_t max_value); THC_API void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension); @@ -61,11 +68,9 @@ THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1 THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, scalar_t value, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, scalar_t value, THCTensor *src1, THCTensor *src2); #endif +#endif diff --git a/aten/src/THC/generic/THCTensorMathReduce.h b/aten/src/THC/generic/THCTensorMathReduce.h index e4eec0fcb418..4f48c54f76c0 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.h +++ b/aten/src/THC/generic/THCTensorMathReduce.h @@ -21,7 +21,6 @@ THC_API accreal 
THCTensor_(varall)(THCState *state, THCTensor *self, int biased) THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); -THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self); THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self); THC_API void THCTensor_(min)(THCState *state, diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 0ee87a27abb8..61225d19b784 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -8,21 +8,6 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); - ptrdiff_t size = THCTensor_(nElement)(state, self_); - if (size == 0) return; - THCGenerator* gen = THCRandom_getGenerator(state); - THCTensor *self = THCTensor_(newContiguous)(state, self_); - scalar_t *data = THCTensor_(data)(state, self); - - generate_uniform<<>>( - gen->state.gen_states, size, data, a, b); - - THCTensor_(freeCopyTo)(state, self, self_); -}; - void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -191,7 +176,8 @@ void THCTensor_(multinomial)(struct THCState *state, // Uniform random samples in a separate kernel launch, into // temporarily allocated memory. The device RNG is thread-limited THCTensor *sampled = THCTensor_(newWithSize2d)(state, numDist, n_sample); - THCTensor_(uniform)(state, sampled, 0.0, 1.0); + auto out = THTensor_wrap(sampled); + at::native::uniform_cuda_(out, 0.0, 1.0); dim3 block(numCategories < maxThreads ? numCategories : maxThreads); dim3 grid(numDist < numSM * 4 ? 
numDist : numSM * 4); @@ -380,8 +366,10 @@ void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, T THCTensor *uniform = THCTensor_(newWithSize1d)(state, n_sample); THCTensor *bernoulli = THCTensor_(newWithSize1d)(state, n_sample); - THCTensor_(uniform)(state, uniform, 0, K); - THCTensor_(uniform)(state, bernoulli, 0, 1); + auto out_uniform = THTensor_wrap(uniform); + auto out_bernoulli = THTensor_wrap(bernoulli); + at::native::uniform_cuda_(out_uniform, 0, K); + at::native::uniform_cuda_(out_bernoulli, 0, 1); multinomialAliasDrawKernel <<>>( diff --git a/aten/src/THC/generic/THCTensorRandom.h b/aten/src/THC/generic/THCTensorRandom.h index 552207d9a885..578c77a6b7e5 100644 --- a/aten/src/THC/generic/THCTensorRandom.h +++ b/aten/src/THC/generic/THCTensorRandom.h @@ -4,7 +4,6 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void THCTensor_(uniform)(struct THCState *state, THCTensor *self, double a, double b); THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv); THC_API void THCTensor_(normal_means)(struct THCState *state, THCTensor *self, THCTensor *means, double stddev); THC_API void THCTensor_(normal_stddevs)(struct THCState *state, THCTensor *self, double mean, THCTensor *stddevs); diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 3c32868c6846..c4e6df5b7d9e 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -33,24 +33,16 @@ ${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialCrossMapLRN.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDepthwiseConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialSubSampling.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBicubic.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBilinear.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingNearest.cu ${CMAKE_CURRENT_SOURCE_DIR}/Sqrt.cu ${CMAKE_CURRENT_SOURCE_DIR}/Square.cu ${CMAKE_CURRENT_SOURCE_DIR}/Tanh.cu ${CMAKE_CURRENT_SOURCE_DIR}/TemporalConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/TemporalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/TemporalRowConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingLinear.cu -${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingNearest.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveAveragePooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAveragePooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedConvolution.cu @@ -59,8 +51,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingNearest.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingTrilinear.cu PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu deleted file mode 100644 index 5adefab5bbd8..000000000000 --- a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu +++ /dev/null @@ -1,121 +0,0 @@ -#include -#include 
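// Illustrative sketch (hypothetical names, plain C++, not taken from the diff):
// the deleted MaxPoolForward kernel below derives a dilated pooling window for
// each output location. Its index arithmetic, lifted out of the kernel, is:
static inline void dilated_window_sketch(int p, int stride, int pad, int kernel,
                                         int dilation, int input_size,
                                         int* start, int* end) {
  int s = p * stride - pad;                 // unclamped window start
  int e = s + (kernel - 1) * dilation + 1;  // one past the last tap
  if (e > input_size) e = input_size;       // clamp to the input extent
  while (s < 0) s += dilation;              // skip padding while keeping the dilation phase
  *start = s;
  *end = e;
}
// The kernel then scans the window with step `dilation`, tracking the maximum
// value and its flattened input index for use in the backward pass.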
-#include -#include -#include -#include -#include - -// kernels borrowed from Caffe -template -__global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, Dtype* top_data, - int64_t* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); - while(hstart < 0) - hstart += dilation_h; - while(wstart < 0) - wstart += dilation_w; - AccType maxval = THCNumerics::min(); - int maxidx = -1; - bottom_data += (n * channels + c) * height * width; - for (int h = hstart; h < hend; h += dilation_h) { - for (int w = wstart; w < wend; w += dilation_w) { - Dtype val = bottom_data[h * width + w]; - if ((ScalarConvert::to(val) > maxval) || THCNumerics::isnan(val)) { - maxidx = h * width + w; - maxval = ScalarConvert::to(val); - } - } - } - top_data[index] = ScalarConvert::to(maxval); - top_mask[index] = maxidx; - } -} - -const int BACKWARD_THREADS = 256; - -template -#if defined (__HIP_PLATFORM_HCC__) -C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 4) -#else -C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 8) -#endif -__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, - const int64_t* top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, - Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, height*width) { - int h = index/width; - int w = index - h * width; -//get some templating performance benefits without actually templating - int phstart, phend, pwstart, pwend; - if (stride_h == 1) { - phstart = - (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1; - phend = min((h + pad_h) + 1, pooled_height); - } else if (stride_h == 2) { - phstart = - (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1; - phend = min((h + pad_h) / 2 + 1, pooled_height); - } else { - phstart = - (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; - phend = min((h + pad_h) / stride_h + 1, pooled_height); - } - if (stride_w == 1) { - pwstart = - (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1; - pwend = min((w + pad_w) + 1, pooled_width); - } else if (stride_w == 2) { - pwstart = - (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1; - pwend = min((w + pad_w) / 2 + 1, pooled_width); - } else { - pwstart = - (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 
0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; - pwend = min((w + pad_w) / stride_w + 1, pooled_width); - } - for (int n = blockIdx.y; n < num; n += gridDim.y) - for (int c = blockIdx.z; c < channels; c+= gridDim.z) { - - AccType gradient = AccType(0); - int offset = (n * channels + c) * pooled_height * pooled_width; - top_diff += offset; - top_mask += offset; -//get some templating performance benefits without actually templating - if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) { - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask[ph * pooled_width + pw] == h * width + w) { - gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); - } - } - } - } else { - if (top_mask[phstart * pooled_width + pwstart] == h * width + w) { - gradient += ScalarConvert::to(top_diff[phstart * pooled_width + pwstart]); - } - } - bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert::to(gradient); - } - } -} - -#include -#include diff --git a/aten/src/THCUNN/SpatialMaxPooling.cu b/aten/src/THCUNN/SpatialMaxPooling.cu deleted file mode 100644 index c01ea5957b7f..000000000000 --- a/aten/src/THCUNN/SpatialMaxPooling.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include - -#include -#include diff --git a/aten/src/THCUNN/SpatialUpSamplingBicubic.cu b/aten/src/THCUNN/SpatialUpSamplingBicubic.cu deleted file mode 100644 index ae22582bd902..000000000000 --- a/aten/src/THCUNN/SpatialUpSamplingBicubic.cu +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -#if defined(__HIP_PLATFORM_HCC__) -__launch_bounds__(1024) -#endif -__global__ void bicubic_interp2d_kernel( - const int num_elements, - const Acctype height_scale, - const Acctype width_scale, - const bool align_corners, - const THCDeviceTensor in_data, - THCDeviceTensor out_data -) { - - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = in_data.getSize(0); - const int channels = in_data.getSize(1); - const int input_height = in_data.getSize(2); - const int input_width = in_data.getSize(3); - const int output_height = out_data.getSize(2); - const int output_width = out_data.getSize(3); - - if (index >= num_elements) { - return; - } - - // Special case: input and output are the same size, just copy - const int output_x = index % output_width; - const int output_y = index / output_width; - if (input_height == output_height && input_width == output_width) { - for (int n = 0; n < batchsize; n++){ - for (int c = 0; c < channels; c++) { - const Dtype val = in_data[n][c][output_y][output_x]; - out_data[n][c][output_x][output_y] = val; - } - } - return; - } - - // Interpolation kernel - Acctype real_x = area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); - int in_x = floorf(real_x); - Acctype t_x = real_x - in_x; - - Acctype real_y = area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); - int in_y = floorf(real_y); - Acctype t_y = real_y - in_y; - - for (int n = 0; n < batchsize ; n++) { - for (int c = 0; c < channels; c++) { - Acctype coefficients[4]; - - for (int k = 0; k < 4; k++) { - coefficients[k] = cubic_interp1d( - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x - 1, in_y - 1 + k), - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x + 0, in_y - 1 + k), - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x + 1, in_y 
- 1 + k), - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x + 2, in_y - 1 + k), - t_x - ); - } - - out_data[n][c][output_y][output_x] = ScalarConvert::to(cubic_interp1d( - coefficients[0], - coefficients[1], - coefficients[2], - coefficients[3], - t_y - )); - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -#if defined(__HIP_PLATFORM_HCC__) -__launch_bounds__(1024) -#endif -__global__ void bicubic_interp2d_backward_kernel( - const int num_elements, - const Acctype height_scale, - const Acctype width_scale, - const bool align_corners, - THCDeviceTensor in_data, - const THCDeviceTensor out_data -){ - - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = in_data.getSize(0); - const int channels = in_data.getSize(1); - const int input_height = in_data.getSize(2); - const int input_width = in_data.getSize(3); - const int output_height = out_data.getSize(2); - const int output_width = out_data.getSize(3); - - if (index >= num_elements) { - return; - } - - const int output_x = index % output_width; - const int output_y = index / output_width; - // special case: output_xust copy - if (input_height == output_height && input_width == output_width) { - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = out_data[n][c][output_y][output_x]; - in_data[n][c][output_y][output_x] += val; - } - } - return; - } - - Acctype real_x = area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); - int input_x = floorf(real_x); - Acctype t_x = real_x - input_x; - - Acctype real_y = area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); - int input_y = floorf(real_y); - Acctype t_y = real_y - input_y; - - Acctype x_coeffs[4]; - Acctype y_coeffs[4]; - - get_cubic_upsampling_coefficients(x_coeffs, t_x); - get_cubic_upsampling_coefficients(y_coeffs, t_y); - - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - Dtype out_value = out_data[n][c][output_y][output_x]; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - upsampling_increment_value_bounded( - in_data, - c, - n, - input_width, - input_height, - input_x - 1 + j, - input_y - 1 + i, - out_value * y_coeffs[i] * x_coeffs[j] - ); - } - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu deleted file mode 100644 index ca9906d8b8eb..000000000000 --- a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu +++ /dev/null @@ -1,130 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rheight, const Acctype rwidth, const bool align_corners, - const THCDeviceTensor data1, THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == 
width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - return; - } - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] - + w1lambda * data1[n][c][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] - + w1lambda * data1[n][c][h1+h1p][w1+w1p]); - data2[n][c][h2][w2] = ScalarConvert::to(val); - } - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel_backward(const int n, - const Acctype rheight, const Acctype rwidth, const bool align_corners, - THCDeviceTensor data1, const THCDeviceTensor data2){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][h1][w1]; - data1[n][c][h2][w2] += val; - } - } - return; - } - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][h2][w2]; - atomicAdd(data1[n][c][h1][w1].data(), - ScalarConvert::to(h0lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][h1][w1+w1p].data(), - ScalarConvert::to(h0lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][h1+h1p][w1].data(), - ScalarConvert::to(h1lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(), - ScalarConvert::to(h1lambda * w1lambda * d2val)); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu deleted file mode 100644 index 13777b2bb28a..000000000000 --- a/aten/src/THCUNN/SpatialUpSamplingNearest.cu +++ /dev/null @@ -1,108 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_4d_kernel( - const int n, - const THCDeviceTensor data1, - THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - } -} - -// Backward operation -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_4d_kernel_backward( - const int n, - THCDeviceTensor data1, - const THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][h2][w2]; - data1[n][c][h1][w1] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, 
width1); - - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][h2][w2]; - atomicAdd(data1[n][c][h1][w1].data(), d2val); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu deleted file mode 100644 index a58cfb3196f1..000000000000 --- a/aten/src/THCUNN/TemporalUpSamplingLinear.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rwidth, const bool align_corners, - const THCDeviceTensor data1, THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][w1]; - data2[n][c][w2] = val; - } - } - return; - } - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Acctype val = w0lambda * data1[n][c][w1] - + w1lambda * data1[n][c][w1+w1p]; - data2[n][c][w2] = ScalarConvert::to(val); - } - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel_backward(const int n, - const Acctype rwidth, const bool align_corners, - THCDeviceTensor data1, const THCDeviceTensor data2){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][w1]; - data1[n][c][w2] += val; - } - } - return; - } - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][w2]; - atomicAdd(data1[n][c][w1].data(), - ScalarConvert::to(w0lambda * d2val)); - atomicAdd(data1[n][c][w1+w1p].data(), - ScalarConvert::to(w1lambda * d2val)); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu deleted file mode 100644 index b10f5e1392e7..000000000000 --- a/aten/src/THCUNN/TemporalUpSamplingNearest.cu +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_3d_kernel( - const int n, - const THCDeviceTensor data1, - THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - const float scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][w1]; - data2[n][c][w2] = val; - } - } - return; - } - // - const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][w1]; - data2[n][c][w2] = val; - } - } - } -} - -// Backward operation -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_3d_kernel_backward( - const int n, - THCDeviceTensor data1, - const THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - const float scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][w1]; - data1[n][c][w2] = val; - } - } - return; - } - // - const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][w2]; - atomicAdd(data1[n][c][w1].data(), d2val); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu deleted file mode 100644 index e94183e2cfc7..000000000000 --- a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu +++ /dev/null @@ -1,248 +0,0 @@ -#include -#include -#include -#include -#include - -#define START_IND(a,b,c) (int)floor((float)(a * c) / b) -#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) -// #define START_IND(a,b,c) a * c / b -// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 - - -#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit - -// 5d tensor B x D x T x H x W -// All kernels view batch dim B and feature dim D as collapsed. 
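// The START_IND/END_IND macros above define, for output index `a` out of `b`
// output cells spanning an input extent `c`, the half-open input range
// [floor(a*c/b), ceil((a+1)*c/b)). The bins always cover the whole input, and
// adjacent bins share an element exactly when c is not a multiple of b, which
// is why the scatter-style backward kernel further below guards its writes
// with atomicAdd. A minimal host-side sketch of the same arithmetic
// (illustrative only, not part of the deleted file):
#include <cmath>
#include <cstdio>

static inline int start_ind(int a, int b, int c) { return (int)std::floor((float)(a * c) / b); }
static inline int end_ind(int a, int b, int c)   { return (int)std::ceil((float)((a + 1) * c) / b); }

static void print_adaptive_bins(int input_size, int output_size) {
  for (int a = 0; a < output_size; ++a) {
    // Each output cell averages input[start..end) along this dimension.
    std::printf("out %d <- in [%d, %d)\n", a,
                start_ind(a, output_size, input_size),
                end_ind(a, output_size, input_size));
  }
}
// e.g. print_adaptive_bins(10, 4) reports bins [0,3), [2,5), [5,8), [7,10):
// the first two and last two bins overlap by one element.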
- -/* - * Description: - * This function adaptively average pools an input 5D tensor along dimensions - * 2, 3 and 4. - * - * gridDim.y blocks work together on a single 2D output plane specified by - * (blockIdx.x + offsetZ). - */ - template -__global__ void cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel( - T *input, T *output, - int isizeT, int isizeH, int isizeW, - int osizeT, int osizeH, int osizeW, - int64_t istrideD, - int64_t istrideT, int64_t istrideH, int64_t istrideW, - int64_t offsetZ) -{ - // iterators on output pixels - int ot, oh, ow; - - // compute offsets based on thread/block ID - int ostartH = blockIdx.y * blockDim.y + threadIdx.y; - int oendH = osizeH; - int ostepH = gridDim.y * blockDim.y; - int ostartW = threadIdx.x; - int oendW = osizeW; - int ostepW = blockDim.x; - - // select output plane - int64_t o_plane = blockIdx.x + offsetZ; - ot = o_plane % osizeT; // output frame/time - int d = o_plane / osizeT; // slice/feature - - // input frame/time ramge is fixed. - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - // input offset by slice/feature and earliest relevant frame/time - T *input_dt = input + d*istrideD + istartT*istrideT; - // output offset by slice/feature and frame/time - T *output_dt = output + o_plane*osizeH*osizeW; - - // For all output pixels... - for(oh = ostartH; oh < oendH; oh += ostepH) { - - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = ostartW; ow < oendW; ow += ostepW) { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - // Compute the average pooling from corresponding input pixels - T *ptr_input = input_dt + istartH*istrideH + istartW*istrideW; - T *ptr_output = output_dt + oh*osizeW + ow; - T sum = ScalarConvert::to(0); - - int it, ih, iw; - for(it = 0; it < kT; ++it) { - for(ih = 0; ih < kH; ++ih) { - for(iw = 0; iw < kW; ++iw) { - T val = ptr_input[ih*istrideH + iw*istrideW]; - sum += val; - } - } - ptr_input += istrideT; // next input frame - } - // Update output - *ptr_output = sum / kT / kH / kW; - } - } -} - -/* - * Description: - * This function computes the gradInput from gradOutput. - * - * gridDim.y blocks work together on a single 2D input plane specified by - * (blockIdx.x + offsetZ). - */ - template -__global__ void cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel( - T *gradInput, T *gradOutput, - int isizeT, int isizeH, int isizeW, - int osizeT, int osizeH, int osizeW, - int64_t offsetZ -) -{ - // iterators on input pixels - int it, ih, iw; - - // compute offsets based on thread/block ID - int istartH = blockIdx.y * blockDim.y + threadIdx.y; - int iendH = isizeH; - int istepH = gridDim.y * blockDim.y; - int istartW = threadIdx.x; - int iendW = isizeW; - int istepW = blockDim.x; - - // select input plane - int64_t i_plane = blockIdx.x + offsetZ; - it = i_plane % isizeT; // output frame/time - int d = i_plane / isizeT; // slice/feature - - // output frame/time ramge is fixed. - int ostartT = START_IND(it, isizeT, osizeT); - int oendT = END_IND(it, isizeT, osizeT); - - // gradInput offset by slice/feature and frame/time - T *gradInput_dt = gradInput + i_plane*isizeH*isizeW; - // gradOutput offset by slice/feature and earliest relevant frame/time - T *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW; - - // For all input pixels... 
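// Each 2D output plane is owned by the blocks with blockIdx.x + offsetZ equal
// to that plane's linear id, and the kernels above decode that id back into a
// frame index and a collapsed batch*feature slice. The generic wrapper later
// in this diff caps blocks.x at 65535 per launch and bumps offsetZ between
// launches, which is why the decode starts from offsetZ rather than from
// blockIdx.x alone. Host-side sketch of that decode and chunked-launch pattern
// (illustrative only, not part of the deleted file):
#include <algorithm>
#include <cstdint>
#include <functional>

// Decode a linear plane id into (collapsed batch*feature slice, output frame).
static inline void decode_plane(int64_t plane_id, int osizeT, int &d, int &ot) {
  ot = static_cast<int>(plane_id % osizeT);  // output frame/time
  d  = static_cast<int>(plane_id / osizeT);  // collapsed batch*feature slice
}

// Chunked-launch pattern used by the wrapper: each launch covers plane ids
// [offsetZ, offsetZ + blocksX), never more than 65535 planes at a time.
static void for_each_chunk(int64_t totalZ,
                           const std::function<void(int64_t offsetZ, int blocksX)> &launch) {
  int64_t offsetZ = 0;
  while (totalZ > 0) {
    launch(offsetZ, static_cast<int>(std::min<int64_t>(totalZ, 65535)));
    totalZ -= 65535;
    offsetZ += 65535;
  }
}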
- for(ih = istartH; ih < iendH; ih += istepH) { - - int ostartH = START_IND(ih, isizeH, osizeH); - int oendH = END_IND(ih, isizeH, osizeH); - - for(iw = istartW; iw < iendW; iw += istepW) { - - int ostartW = START_IND(iw, isizeW, osizeW); - int oendW = END_IND(iw, isizeW, osizeW); - - // Compute the gradients from corresponding output pixels - T *ptr_gradInput = gradInput_dt + ih*isizeW + iw; - T *ptr_gradOutput = gradOutput_dt; - - // for all relevant output pixels - int ot, oh, ow; - for(ot = ostartT; ot < oendT; ++ot) { - int kT = END_IND(ot, osizeT, isizeT) - START_IND(ot, osizeT, isizeT); - for(oh = ostartH; oh < oendH; ++oh) { - int kH = END_IND(oh, osizeH, isizeH) - START_IND(oh, osizeH, isizeH); - for(ow = ostartW; ow < oendW; ++ow) { - int kW = END_IND(ow, osizeW, isizeW) - START_IND(ow, osizeW, isizeW); - T grad_delta = ptr_gradOutput[oh*osizeW + ow] / kW / kH / kT; - *ptr_gradInput += grad_delta; - } - } - ptr_gradOutput += osizeH*osizeW; // next output frame - } - } - } -} - -/* - * Description: - * This function computes the gradInput from gradOutput without assuming - * dependencies between input pixels and output pixels. - * - * gridDim.y blocks work together on a single 2D output plane specified by - * (blockIdx.x + offsetZ). - * - * (uses atomic add) - */ - template -__global__ void cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel( - T *gradInput, T *gradOutput, - int isizeT, int isizeH, int isizeW, - int osizeT, int osizeH, int osizeW, - int64_t offsetZ -) -{ - // iterators on output pixels - int ot, oh, ow; - - // compute offsets based on thread/block ID - int ostartH = blockIdx.y * blockDim.y + threadIdx.y; - int oendH = osizeH; - int ostepH = gridDim.y * blockDim.y; - int ostartW = threadIdx.x; - int oendW = osizeW; - int ostepW = blockDim.x; - - // select output plane - int64_t o_plane = blockIdx.x + offsetZ; - ot = o_plane % osizeT; // output frame/time - int d = o_plane / osizeT; // output slice/feature - - // input frame/time ramge is fixed. - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - // gradInput offset by slice/feature and earliest relevant frame/time - T *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW; - // gradOutput offset by slice/feature and frame/time - T *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW; - - // For all output pixels... 
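// The two gradInput kernels differ only in which side owns the loop: the
// non-atomic one above gathers into each input cell (one writer per cell, no
// races), while the atomic one below scatters from each output cell and needs
// atomicAdd because adaptive bins can share input cells. A simplified 1D CPU
// model of both strategies (illustrative only, not part of the deleted file):
#include <cmath>
#include <vector>

static inline int bin_start(int a, int b, int c) { return (int)std::floor((float)(a * c) / b); }
static inline int bin_end(int a, int b, int c)   { return (int)std::ceil((float)((a + 1) * c) / b); }

// Gather: iterate input cells, pull from every output bin that covers them.
static void backward_gather_1d(std::vector<float> &gin, const std::vector<float> &gout) {
  const int in = (int)gin.size(), out = (int)gout.size();
  for (int i = 0; i < in; ++i)
    for (int o = bin_start(i, in, out); o < bin_end(i, in, out); ++o) {
      int k = bin_end(o, out, in) - bin_start(o, out, in);  // bin size of output cell o
      gin[i] += gout[o] / k;
    }
}

// Scatter: iterate output bins, push into every input cell they cover.
// On the GPU these "+=" can collide across threads, hence atomicAdd.
static void backward_scatter_1d(std::vector<float> &gin, const std::vector<float> &gout) {
  const int in = (int)gin.size(), out = (int)gout.size();
  for (int o = 0; o < out; ++o) {
    int s = bin_start(o, out, in), e = bin_end(o, out, in);
    for (int i = s; i < e; ++i) gin[i] += gout[o] / (e - s);
  }
}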
- for(oh = ostartH; oh < oendH; oh += ostepH) { - - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = ostartW; ow < oendW; ow += ostepW) { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - // Compute the gradients from corresponding input pixels - T *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW; - T *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow; - T grad_delta = *ptr_gradOutput / kT / kH / kW; - - int it, ih, iw; - for(it = 0; it < kT; ++it) { - for(ih = 0; ih < kH; ++ih) { - for(iw = 0; iw < kW; ++iw) { - atomicAdd(&(ptr_gradInput[ih*isizeW + iw]), grad_delta); - } - } - ptr_gradInput += isizeH*isizeW; // next input frame - } - } - } -} - -#include -#include - -#undef CUDA_MAX_THREADS -#undef START_IND -#undef END_IND diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu deleted file mode 100644 index 8e98b400a0f0..000000000000 --- a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_5d_kernel( - const int n, - const THCDeviceTensor data1, - THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int width2 = data2.getSize(4); - const float depth_scale = (float) depth1 / (float) depth2; - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int d2 = index / (height2*width2); // 0:depth2-1 - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int d1 = d2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][d1][h1][w1]; - data2[n][c][d2][h2][w2] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); - const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][d1][h1][w1]; - data2[n][c][d2][h2][w2] = val; - } - } - } -} - -// Backward operation -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_5d_kernel_backward( - const int n, - THCDeviceTensor data1, - const THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int 
width2 = data2.getSize(4); - const float depth_scale = (float) depth1 / (float) depth2; - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int d2 = index / (height2*width2); // 0:depth2-1 - - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int d1 = d2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][d1][h1][w1]; - data1[n][c][d2][h2][w2] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); - const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][d2][h2][w2]; - atomicAdd(data1[n][c][d1][h1][w1].data(), val); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu deleted file mode 100644 index 48d72bba86d5..000000000000 --- a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu +++ /dev/null @@ -1,160 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -C10_LAUNCH_BOUNDS_1(1024) -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, - const THCDeviceTensor data1, THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int width2 = data2.getSize(4); - - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int t2 = index / (height2*width2); // 0:depth2-1 - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int t1 = t2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][t1][h1][w1]; - data2[n][c][t2][h2][w2] = val; - } - } - return; - } - // - const Acctype t1r = area_pixel_compute_source_index(rdepth, t2, align_corners, /*cubic=*/false); - const int t1 = t1r; - const int t1p = (t1 < depth1 - 1) ? 1 : 0; - const Acctype t1lambda = t1r - t1; - const Acctype t0lambda = Acctype(1) - t1lambda; - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 
1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Acctype val = t0lambda * (h0lambda * (w0lambda * data1[n][c][t1][h1][w1] - + w1lambda * data1[n][c][t1][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][t1][h1+h1p][w1] - + w1lambda * data1[n][c][t1][h1+h1p][w1+w1p])) - + t1lambda * (h0lambda * (w0lambda * data1[n][c][t1+t1p][h1][w1] - + w1lambda * data1[n][c][t1+t1p][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][t1+t1p][h1+h1p][w1] - + w1lambda * data1[n][c][t1+t1p][h1+h1p][w1+w1p])); - data2[n][c][t2][h2][w2] = ScalarConvert::to(val); - } - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -C10_LAUNCH_BOUNDS_1(1024) -__global__ void caffe_gpu_interp2_kernel_backward(const int n, - const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, - THCDeviceTensor data1, const THCDeviceTensor data2){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int width2 = data2.getSize(4); - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int t2 = index / (height2*width2); // 0:depth2-1 - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int t1 = t2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][t1][h1][w1]; - data1[n][c][t2][h2][w2] += val; - } - } - return; - } - // - const Acctype t1r = area_pixel_compute_source_index(rdepth, t2, align_corners, /*cubic=*/false); - const int t1 = t1r; - const int t1p = (t1 < depth1 - 1) ? 1 : 0; - const Acctype t1lambda = t1r - t1; - const Acctype t0lambda = Acctype(1) - t1lambda; - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][t2][h2][w2]; - atomicAdd(data1[n][c][t1][h1][w1].data(), - ScalarConvert::to(t0lambda * h0lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1][h1][w1+w1p].data(), - ScalarConvert::to(t0lambda * h0lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][t1][h1+h1p][w1].data(), - ScalarConvert::to(t0lambda * h1lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1][h1+h1p][w1+w1p].data(), - ScalarConvert::to(t0lambda * h1lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1][w1].data(), - ScalarConvert::to(t1lambda * h0lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1][w1+w1p].data(), - ScalarConvert::to(t1lambda * h0lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1].data(), - ScalarConvert::to(t1lambda * h1lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1+w1p].data(), - ScalarConvert::to(t1lambda * h1lambda * w1lambda * d2val)); - } - } - } - ///////////////////////////////////////////////////////// -} - - -#include -#include diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu index 78055cf38729..29b2834e5def 100644 --- a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -8,10 +8,10 @@ void THNN_(SpatialClassNLLCriterion_shapeCheck)( THCIndexTensor *target, THCTensor *weights) { - AT_CHECK(!target->is_empty() && target->dim() == 3, 1, + TORCH_CHECK(!target->is_empty() && target->dim() == 3, 1, "only batches of spatial targets supported (non-empty 3D tensors)" \ " but got targets of size: : ", target->sizes()); - AT_CHECK(!input->is_empty() && input->dim() == 4, 2, + TORCH_CHECK(!input->is_empty() && input->dim() == 4, 2, "only batches of spatial inputs supported (non-empty 4D tensors), " \ "but got input of size: ", input->sizes()); if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || @@ -33,7 +33,7 @@ static void THNN_(SpatialClassNLLCriterion_gradOutput_no_reduce_shapeCheck)( THCTensor *gradOutput, THCIndexTensor *target) { - AT_CHECK(!gradOutput->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, gradOutput) == 3, 2, + TORCH_CHECK(!gradOutput->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, gradOutput) == 3, 2, "Expected non-empty dimension 3 but got gradOutput of size: ", gradOutput->sizes()); if (THCTensor_(size)(state, gradOutput, 0) != THCIndexTensor_(size)(state, target, 0) || THCTensor_(size)(state, gradOutput, 1) != THCIndexTensor_(size)(state, target, 1) || diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu index c75a8bed967c..9e9825414096 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -53,7 +53,7 @@ static THCTensor* THNN_(view_weight_local)( THCTensor *_weight) { THCTensor *weight = THCTensor_(newContiguous)(state, _weight); - AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, + TORCH_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); if (weight->dim() == 6) { int64_t s1 = weight->size(0) * weight->size(1); diff --git 
a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu deleted file mode 100644 index aa3cb035e3d3..000000000000 --- a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu +++ /dev/null @@ -1,199 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialDilatedMaxPooling.cu" -#else - -#include -#include -#include - -static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *gradOutput, THCIndexTensor *indices, - int kH, int kW, int dH, int dW, int padH, int padW, - int dilationH, int dilationW, bool ceil_mode) { - - THArgCheck(kW > 0 && kH > 0, 5, - "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); - THArgCheck(dW > 0 && dH > 0, 8, - "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - THArgCheck(dilationH > 0 && dilationW > 0, 12, - "dilation should be greater than zero, but got dilationH: %d dilationW: %d", - dilationH, dilationW); - - int ndim = input->dim(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - int batchSize = 1; - - if (ndim == 4) { - batchSize = input->size(0); - dimf++; - dimh++; - dimw++; - } - - THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, - "non-empty 3D or 4D input tensor expected but got: %s"); - THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, - "pad should be smaller than half of kernel size, but got " - "padW = %d, padH = %d, kW = %d, kH = %d", - padW, padH, kW, kH); - - int64_t nInputPlane = input->size(dimh-1); - int64_t nInputRows = input->size(dimh); - int64_t nInputCols = input->size(dimw); - int64_t nOutputPlane = nInputPlane; - - int64_t nOutputRows = pooling_output_shape(nInputRows, kH, padH, dH, dilationH, ceil_mode); - int64_t nOutputCols = pooling_output_shape(nInputCols, kW, padW, dW, dilationW, ceil_mode); - - if (nOutputCols < 1 || nOutputRows < 1) - THError("Given input size: (%dx%dx%d). " - "Calculated output size: (%dx%dx%d). 
Output size is too small", - nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); - THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows); - THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols); - } - if (indices != NULL) { - THCUNN_check_dim_size_indices(state, indices, 4, 0, batchSize); - THCUNN_check_dim_size_indices(state, indices, 4, 1, nOutputPlane); - THCUNN_check_dim_size_indices(state, indices, 4, 2, nOutputRows); - THCUNN_check_dim_size_indices(state, indices, 4, 3, nOutputCols); - } -} - -void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode) -{ - - THCUNN_assertSameGPU(state, 3, input, output, indices); - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (state, input, NULL, NULL, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - int64_t nInputCols, nInputRows, nInputPlane, batchSize; - int64_t nOutputCols, nOutputRows; - - if (input->dim() == 3) { - nInputCols = input->size(2); - nInputRows = input->size(1); - nInputPlane = input->size(0); - batchSize = 1; - } - else - { - nInputCols = input->size(3); - nInputRows = input->size(2); - nInputPlane = input->size(1); - batchSize = input->size(0); - } - - nOutputCols = pooling_output_shape(nInputCols, kW, padW, dW, dilationW, ceil_mode); - nOutputRows = pooling_output_shape(nInputRows, kH, padH, dH, dilationH, ceil_mode); - - input = THCTensor_(newContiguous)(state, input); - scalar_t* input_data = THCTensor_(data)(state, input); - - THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); - THCUNN_resizeAs_indices(state, indices, output); - - THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices); - scalar_t* output_data = THCTensor_(data)(state, output); - - int count = THCTensor_(nElement)(state, output); - - MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, input_data, - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); - THCudaCheck(cudaGetLastError()); - - if(input->dim() == 3) - THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); - - THCTensor_(free)(state, input); -} - -void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (state, input, gradOutput, indices, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - input = THCTensor_(newContiguous)(state, input); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - int64_t nInputCols, nInputRows, nInputPlane, batchSize; - int64_t nOutputCols, nOutputRows; - - if (THTensor_nDimensionLegacyAll(input) == 3) { - nInputCols = input->size(2); - nInputRows = input->size(1); - nInputPlane = input->size(0); - batchSize = 1; - } - else - { - nInputCols = input->size(3); - nInputRows = input->size(2); - nInputPlane = input->size(1); - batchSize = input->size(0); 
- } - - nOutputCols = pooling_output_shape(nInputCols, kW, padW, dW, dilationW, ceil_mode); - nOutputRows = pooling_output_shape(nInputRows, kH, padH, dH, dilationH, ceil_mode); - - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resizeAs)(state, gradInput, input); - - int count = THCTensor_(nElement)(state, input); - dim3 grid; - int imgcount = nInputCols * nInputRows; - const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS; - grid.x = blocks; - grid.y = batchSize; - grid.z = nInputPlane; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridY < grid.y) grid.y = maxGridY; - if (maxGridZ < grid.z) grid.z = maxGridZ; - MaxPoolBackward <<< grid, BACKWARD_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, - THCTensor_(data)(state, gradOutput), - THCIndexTensor_(data)(state, indices), - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - THCTensor_(data)(state, gradInput)); - THCudaCheck(cudaGetLastError()); - - THCTensor_(free)(state, gradOutput); - - // clean - THCTensor_(free)(state, input); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialMaxPooling.cu b/aten/src/THCUNN/generic/SpatialMaxPooling.cu deleted file mode 100644 index 21a65f506a1d..000000000000 --- a/aten/src/THCUNN/generic/SpatialMaxPooling.cu +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialMaxPooling.cu" -#else - -#include - -void THNN_(SpatialMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode) -{ - THNN_(SpatialDilatedMaxPooling_updateOutput)( - state, input, output, indices, - kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); - -} - -void THNN_(SpatialMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode) -{ - THNN_(SpatialDilatedMaxPooling_updateGradInput)( - state, input, gradOutput, gradInput, indices, - kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); - -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBicubic.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBicubic.cu deleted file mode 100644 index b984745147c0..000000000000 --- a/aten/src/THCUNN/generic/SpatialUpSamplingBicubic.cu +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialUpSamplingBicubic.cu" -#else - -#include -#include - -static inline void THNN_(SpatialUpSamplingBicubic_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputHeight, int inputWidth, - int outputHeight, int outputWidth) { - THArgCheck(inputHeight > 0 && inputWidth > 0 - && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (H: %d, W: %d) output (H: %d, W: %d)", - inputHeight, inputWidth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); - 
THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); - } -} - -void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); - THNN_(SpatialUpSamplingBicubic_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputHeight, inputWidth, - outputHeight, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize4d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputHeight, outputWidth); - THCTensor_(zero)(state, output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); - - // Get scaling factors - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - - const int num_output_elements = outputHeight * outputWidth; - const int max_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - - // Launch kernel - cudaStream_t stream = THCState_getCurrentStream(state); - bicubic_interp2d_kernel <<< - THCCeilDiv(num_output_elements, max_threads), - max_threads, - 0, - stream - >>>(num_output_elements, rheight, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - THNN_(SpatialUpSamplingBicubic_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputHeight, inputWidth, - outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor in_data = toDeviceTensor(state, gradInput); - THCDeviceTensor out_data = toDeviceTensor(state, gradOutput); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - bicubic_interp2d_backward_kernel <<>>(num_kernels, rheight, rwidth, align_corners, in_data, out_data); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu deleted file mode 100644 index 6afb863f8dc3..000000000000 --- a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialUpSamplingBilinear.cu" -#else - 
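// Every upsampling kernel in this diff maps an output coordinate back to a
// fractional input coordinate through area_pixel_compute_scale() and
// area_pixel_compute_source_index(), which live in a header that is not shown
// here. A sketch of their usual semantics (an assumption, from memory, not
// quoted from this diff):
static inline float area_pixel_scale(int input_size, int output_size, bool align_corners) {
  if (align_corners)
    return output_size > 1 ? (float)(input_size - 1) / (output_size - 1) : 0.f;
  return (float)input_size / output_size;
}

static inline float area_pixel_source_index(float scale, int dst_index,
                                            bool align_corners, bool cubic) {
  if (align_corners) return scale * dst_index;
  float src = scale * (dst_index + 0.5f) - 0.5f;  // align pixel centers
  return (!cubic && src < 0.f) ? 0.f : src;       // linear/nearest modes clamp at 0
}
// The kernels then split the result into an integer index plus a lambda weight,
// e.g. h1 = (int)h1r and h1lambda = h1r - h1 in caffe_gpu_interp2_kernel.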
-#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputHeight, int inputWidth, - int outputHeight, int outputWidth) { - THArgCheck(inputHeight > 0 && inputWidth > 0 - && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (H: %d, W: %d) output (H: %d, W: %d)", - inputHeight, inputWidth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); - } -} - -void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); - THNN_(SpatialUpSamplingBilinear_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputHeight, inputWidth, - outputHeight, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize4d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputHeight, outputWidth); - THCTensor_(zero)(state, output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel <<>>(num_kernels, rheight, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - THNN_(SpatialUpSamplingBilinear_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputHeight, inputWidth, - outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; 
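// The wrapper above launches caffe_gpu_interp2_kernel with one thread per
// output pixel; the per-pixel arithmetic is the 2D bilinear blend of the four
// neighbouring input pixels using the h/w lambda weights computed earlier in
// this diff. A single-channel CPU reference of that blend (illustrative only;
// assumes a row-major height1 x width1 input buffer):
#include <vector>

static float bilinear_sample(const std::vector<float> &in, int height1, int width1,
                             float h1r, float w1r) {
  const int h1 = (int)h1r, w1 = (int)w1r;
  const int h1p = (h1 < height1 - 1) ? 1 : 0;   // step to the next row, if any
  const int w1p = (w1 < width1 - 1) ? 1 : 0;    // step to the next column, if any
  const float h1lambda = h1r - h1, h0lambda = 1.f - h1lambda;
  const float w1lambda = w1r - w1, w0lambda = 1.f - w1lambda;
  return h0lambda * (w0lambda * in[h1 * width1 + w1] +
                     w1lambda * in[h1 * width1 + w1 + w1p]) +
         h1lambda * (w0lambda * in[(h1 + h1p) * width1 + w1] +
                     w1lambda * in[(h1 + h1p) * width1 + w1 + w1p]);
}
// h1r/w1r are the fractional source coordinates produced by
// area_pixel_compute_source_index for the output pixel being filled.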
- cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rheight, rwidth, align_corners, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu deleted file mode 100644 index 85a7b831561d..000000000000 --- a/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialUpSamplingNearest.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputHeight, int inputWidth, - int outputHeight, int outputWidth) { - THArgCheck(inputHeight > 0 && inputWidth > 0 - && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (H: %d, W: %d) output (H: %d, W: %d)", - inputHeight, inputWidth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, THTensor_nDimensionLegacyAll(input) == 4, 2, input, - "4D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); - } -} - - -void THNN_(SpatialUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, input, output); - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); - - THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, - inputHeight, inputWidth, - outputHeight, outputWidth); - THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); - - THCTensor_(resize4d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputHeight, - outputWidth); - THCTensor_(zero)(state, output); - - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - - const int num_kernels = outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_4d_kernel <<>>(num_kernels, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - - -void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THNN_(SpatialUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, - inputHeight, inputWidth, outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); - - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = 
toDeviceTensor(state, gradOutput); - - const int num_kernels = outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - - nearest_neighbor_4d_kernel_backward <<>>(num_kernels, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index 58b4b3c42c8f..9739dc06b18c 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -665,29 +665,6 @@ THC_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( int adjW, int adjH, accreal scale); -THC_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - -THC_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - THC_API void THNN_(SpatialFullConvolution_updateOutput)( THCState *state, THCTensor *input, @@ -727,27 +704,6 @@ THC_API void THNN_(SpatialFullConvolution_accGradParameters)( int adjW, int adjH, accreal scale); -THC_API void THNN_(SpatialMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode); - -THC_API void THNN_(SpatialMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode); - THC_API void THNN_(SpatialMaxUnpooling_updateOutput)( THCState *state, THCTensor *input, @@ -791,64 +747,6 @@ THC_API void THNN_(SpatialSubSampling_accGradParameters)( int dW, int dH, accreal scale); -THC_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth); - -THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth); - THC_API void THNN_(RReLU_updateOutput)( THCState *state, THCTensor *input, @@ -1043,38 +941,6 @@ THC_API 
void THNN_(TemporalRowConvolution_accGradParameters)( bool featFirst, accreal scale); -THC_API void THNN_(TemporalUpSamplingLinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth, - bool align_corners); - -THC_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth, - bool align_corners); - -THC_API void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth); - -THC_API void THNN_(TemporalUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth); - THC_API void THNN_(VolumetricAveragePooling_updateOutput)( THCState *state, THCTensor *input, @@ -1315,62 +1181,4 @@ THC_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int padT, int padW, int padH); -THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int osizeT, - int osizeW, - int osizeH); - -THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput); - -THC_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth); - -THC_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth); - -THC_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners); - #endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu deleted file mode 100644 index 4b1ef97c9c09..000000000000 --- a/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/TemporalUpSamplingLinear.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(TemporalUpSamplingLinear_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputWidth, - int outputWidth) { - THArgCheck(inputWidth > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (W: %d) output (W: %d)", - inputWidth, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 3, 2, input, - "non-empty 3D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); - } -} - -void 
THNN_(TemporalUpSamplingLinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputWidth = THCTensor_(size)(state, input, 2); - THNN_(TemporalUpSamplingLinear_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputWidth, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize3d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputWidth); - THCTensor_(zero)(state, output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputWidth > 0 && outputWidth > 0); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel <<>>(num_kernels, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(TemporalUpSamplingLinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth, - bool align_corners) -{ - THNN_(TemporalUpSamplingLinear_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputWidth, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rwidth, align_corners, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu deleted file mode 100644 index 1658b180c85a..000000000000 --- a/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/TemporalUpSamplingNearest.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(TemporalUpSamplingNearest_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputWidth, - int outputWidth) { - THArgCheck(inputWidth > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (W: %d) output (W: %d)", - inputWidth, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, THTensor_nDimensionLegacyAll(input) == 3, 2, input, - "3D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); - } -} - -void THNN_(TemporalUpSamplingNearest_updateOutput)( 
- THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, input, output); - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputWidth = THCTensor_(size)(state, input, 2); - - THNN_(TemporalUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, inputWidth, outputWidth); - THAssert(inputWidth > 0 && outputWidth > 0); - - THCTensor_(resize3d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputWidth); - THCTensor_(zero)(state, output); - - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - - const int num_kernels = outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_3d_kernel <<>>(num_kernels, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THNN_(TemporalUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, inputWidth, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); - - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - - const int num_kernels = outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - - nearest_neighbor_3d_kernel_backward <<>>(num_kernels, data1, data2); - - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu deleted file mode 100644 index 5e315a407029..000000000000 --- a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu +++ /dev/null @@ -1,173 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/VolumetricAdaptiveAveragePooling.cu" -#else - -#include - -// 5d tensor B x D x T x H x W - -void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int osizeT, - int osizeW, - int osizeH) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); - - - scalar_t *output_data; - scalar_t *input_data; - - int64_t sizeD, isizeT, isizeH, isizeW; - int64_t istrideD, istrideT, istrideH, istrideW; - int64_t totalZ; - - if (input->dim() == 4) { - sizeD = input->size(0); - isizeT = input->size(1); - isizeH = input->size(2); - isizeW = input->size(3); - - istrideD = input->stride(0); - istrideT = input->stride(1); - istrideH = input->stride(2); - istrideW = input->stride(3); - - THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); - - totalZ = sizeD * osizeT; - } else { - input = THCTensor_(newContiguous)(state, input); - - int64_t sizeB = input->size(0); - sizeD = input->size(1); - isizeT = input->size(2); - isizeH 
= input->size(3); - isizeW = input->size(4); - - istrideD = input->stride(1); - istrideT = input->stride(2); - istrideH = input->stride(3); - istrideW = input->stride(4); - - THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); - - totalZ = sizeB * sizeD * osizeT; - } - - input_data = THCTensor_(data)(state, input); - output_data = THCTensor_(data)(state, output); - - int64_t offsetZ = 0; - dim3 threads(32, 8); - // each H*W plane is processed by blocksH thread blocks - int blocksH = max((int)(16L / totalZ), 1); - while (totalZ > 0) { - dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); - cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel - <<>>( - input_data, output_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, - istrideD, istrideT, istrideH, istrideW, offsetZ - ); - - totalZ -= 65535; - offsetZ += 65535; - THCudaCheck(cudaGetLastError()); - } - - if (input->dim() == 5) { - // clean - THCTensor_(free)(state, input); - } -} - -void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - THCTensor_(resizeAs)(state, gradInput, input); - THCTensor_(zero)(state, gradInput); - - scalar_t *gradInput_data; - scalar_t *gradOutput_data; - - int64_t sizeD, isizeT, isizeH, isizeW; - int64_t osizeT, osizeH, osizeW; - int64_t totalZ; - - if (input->dim() == 4) { - sizeD = input->size(0); - isizeT = input->size(1); - isizeH = input->size(2); - isizeW = input->size(3); - - osizeT = gradOutput->size(1); - osizeH = gradOutput->size(2); - osizeW = gradOutput->size(3); - } else { - sizeD = input->size(1); - isizeT = input->size(2); - isizeH = input->size(3); - isizeW = input->size(4); - - osizeT = gradOutput->size(2); - osizeH = gradOutput->size(3); - osizeW = gradOutput->size(4); - } - - // somehow nonatomic is passing all test for volumetric case. - bool atomic = false; //(isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); - - if (input->dim() == 4) { - totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; - } else { - int sizeB = input->size(0); - totalZ = atomic ? sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; - } - - gradInput_data = THCTensor_(data)(state, gradInput); - gradOutput_data = THCTensor_(data)(state, gradOutput); - - int64_t offsetZ = 0; - dim3 threads(32, 8); - // each H*W plane is processed by blocksH thread blocks - int blocksH = max((int)(16L / totalZ), 1); - while (totalZ > 0) { - dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); - - if (atomic) - { - cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel - <<>>( - gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, offsetZ - ); - } else { - cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel - <<>>( - gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, offsetZ - ); - } - - totalZ -= 65535; - offsetZ += 65535; - THCudaCheck(cudaGetLastError()); - } - // clean - THCTensor_(free)(state, gradOutput); - -} - -#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu deleted file mode 100644 index 7b3a142876f3..000000000000 --- a/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/VolumetricUpSamplingNearest.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputDepth, int inputHeight, int inputWidth, - int outputDepth, int outputHeight, int outputWidth) { - THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 - && outputDepth && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", - inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, THTensor_nDimensionLegacyAll(input) == 5, 2, input, - "5D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); - THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); - } -} - - -void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, input, output); - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputDepth = THCTensor_(size)(state, input, 2); - int inputHeight = THCTensor_(size)(state, input, 3); - int inputWidth = THCTensor_(size)(state, input, 4); - - THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && - outputDepth > 0 && outputHeight > 0 && outputWidth > 0); - - THCTensor_(resize5d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputDepth, - outputHeight, - outputWidth); - THCTensor_(zero)(state, output); - - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_5d_kernel <<>>(num_kernels, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - - -void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - 
THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); - - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_5d_kernel_backward <<>>(num_kernels, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu deleted file mode 100644 index 73e0655352e3..000000000000 --- a/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/VolumetricUpSamplingTrilinear.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputDepth, int inputHeight, int inputWidth, - int outputDepth, int outputHeight, int outputWidth) { - THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 - && outputDepth && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", - inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 5, 2, input, - "non-empty 5D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); - THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); - } -} - -void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputDepth = THCTensor_(size)(state, input, 2); - int inputHeight = THCTensor_(size)(state, input, 3); - int inputWidth = THCTensor_(size)(state, input, 4); - THNN_(VolumetricUpSamplingTrilinear_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize5d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputDepth, outputHeight, outputWidth); - THCTensor_(zero)(state, 
output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && outputDepth > 0 && outputHeight > 0 && outputWidth > 0); - const accreal rdepth = area_pixel_compute_scale(inputDepth, outputDepth, align_corners); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - THNN_(VolumetricUpSamplingTrilinear_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const accreal rdepth = area_pixel_compute_scale(inputDepth, outputDepth, align_corners); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/upsampling.h b/aten/src/THCUNN/upsampling.h deleted file mode 100644 index 66daea10d754..000000000000 --- a/aten/src/THCUNN/upsampling.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef THCUNN_UPSAMPLING_H -#define THCUNN_UPSAMPLING_H - -#include -#include - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) - - -template -__host__ __forceinline__ -static Acctype area_pixel_compute_scale( - int inputSize, int outputSize, bool align_corners) { - if (outputSize > 1) { - return align_corners ? (Acctype) (inputSize - 1) / (outputSize - 1) - : (Acctype) inputSize / outputSize; - } else { - return Acctype(0); - } -} - -template -__device__ __forceinline__ -static Acctype area_pixel_compute_source_index( - Acctype scale, int dst_index, bool align_corners, bool cubic) { - if (align_corners) { - return scale * dst_index; - } else { - Acctype src_idx = scale * (dst_index + Acctype(0.5)) - Acctype(0.5); - // See Note[Follow Opencv resize logic] - return (!cubic && src_idx < Acctype(0)) ? 
Acctype(0) : src_idx; - } -} - -__device__ __forceinline__ -static int nearest_neighbor_compute_source_index( - const float scale, int dst_index, int inputSize) { - const int src_index = MIN(floor(dst_index * scale), inputSize - 1); - return src_index; -} - -template -__device__ __forceinline__ -static Dtype upsampling_get_value_bounded( - const THCDeviceTensor data, - int channel, - int batch, - int width, - int height, - int x, - int y -) { - int access_x = max(min(x, width - 1), 0); - int access_y = max(min(y, height - 1), 0); - return data[batch][channel][access_y][access_x]; -} - -template -__device__ __forceinline__ -static void upsampling_increment_value_bounded( - const THCDeviceTensor data, - int channel, - int batch, - int width, - int height, - int x, - int y, - Acctype value -) { - int access_x = max(min(x, width - 1), 0); - int access_y = max(min(y, height - 1), 0); - atomicAdd( - data[batch][channel][access_y][access_x].data(), - ScalarConvert::to(value) - ); -} - -// Based on https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm -template -__device__ __forceinline__ -static Acctype cubic_convolution1(Acctype x, Acctype A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; -} - -template -__device__ __forceinline__ -static Acctype cubic_convolution2(Acctype x, Acctype A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; -} - -template -__device__ __forceinline__ -static void get_cubic_upsampling_coefficients( - Acctype coeffs[4], - Acctype t -) { - Acctype A = -0.75; - - Acctype x1 = t; - coeffs[0] = cubic_convolution2(x1 + 1.0, A); - coeffs[1] = cubic_convolution1(x1, A); - - // opposite coefficients - Acctype x2 = 1.0 - t; - coeffs[2] = cubic_convolution1(x2, A); - coeffs[3] = cubic_convolution2(x2 + 1.0, A); -} - -template -__device__ __forceinline__ -static Acctype cubic_interp1d( - Dtype x0, - Dtype x1, - Dtype x2, - Dtype x3, - Acctype t -) { - Acctype coeffs[4]; - get_cubic_upsampling_coefficients(coeffs, t); - - return x0 * coeffs[0] - + x1 * coeffs[1] - + x2 * coeffs[2] - + x3 * coeffs[3]; -} - -#endif diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 8be35566342d..cdc4d37a1744 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,21 +17,21 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; scalar_t sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + TORCH_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) { nframe = 1; dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + TORCH_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size(0) == nframe) + TORCH_CHECK(!target->is_empty() && target->dim() == 2 && (target->size(0) == nframe) && (target->size(1) == dim), "inconsistent target size"); } @@ -157,25 +157,25 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; scalar_t g; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + TORCH_CHECK(!input->is_empty() && input->dim() <= 2, "vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) 
{ nframe = 1; dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + TORCH_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), + TORCH_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), "inconsistent isTarget size"); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + TORCH_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) && (target->size(1) == dim), 3, "inconsistent target size"); - AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size(0) == nframe) + TORCH_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size(0) == nframe) && (isTarget->size(1) == dim), 3, "inconsistent isTarget size"); } diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 34ecf26b8ac0..00080169ab86 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,7 +20,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; scalar_t sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + TORCH_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) @@ -32,7 +32,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + TORCH_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -136,7 +136,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; scalar_t g; - AT_CHECK(!input->is_empty() && (input->dim() <= 2), + TORCH_CHECK(!input->is_empty() && (input->dim() <= 2), "non-empty vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) @@ -148,7 +148,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + TORCH_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c deleted file mode 100644 index 94d438a36a37..000000000000 --- a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c +++ /dev/null @@ -1,368 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/SpatialDilatedMaxPooling.c" -#else - -#include -#include - -#include - -static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( - THTensor *input, THTensor *gradOutput, THIndexTensor *indices, - int kH, int kW, int dH, int dW, int padH, int padW, - int dilationH, int dilationW, bool ceil_mode) { - - THArgCheck(kW > 0 && kH > 0, 5, - "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); - 
THArgCheck(dW > 0 && dH > 0, 8, - "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - THArgCheck(dilationH > 0 && dilationW > 0, 12, - "dilation should be greater than zero, but got dilationH: %d dilationW: %d", - dilationH, dilationW); - - int ndim = input->dim(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - - if (ndim == 4) { - dimf++; - dimh++; - dimw++; - } - - THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, - "non-empty 3D or 4D input tensor expected but got: %s"); - - THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, - "pad should be smaller than half of kernel size, but got " - "padW = %d, padH = %d, kW = %d, kH = %d", - padW, padH, kW, kH); - - int64_t nInputPlane = input->size(dimh-1); - int64_t inputHeight = input->size(dimh); - int64_t inputWidth = input->size(dimw); - int64_t nOutputPlane = nInputPlane; - - int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); - int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); - - if (outputWidth < 1 || outputHeight < 1) - THError("Given input size: (%dx%dx%d). " - "Calculated output size: (%dx%dx%d). Output size is too small", - nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); - - if (gradOutput != NULL) { - THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); - THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); - THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); - } - if (indices != NULL) { - THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane); - THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight); - THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth); - } -} - -static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)( - scalar_t *input_p, - scalar_t *output_p, - THIndex_t *ind_p, - int64_t nslices, - int64_t iwidth, - int64_t iheight, - int64_t owidth, - int64_t oheight, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - int dilationW, - int dilationH - ) -{ - at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { - /* loop over output */ - int64_t i, j; - scalar_t *ip = input_p + k*iwidth*iheight; - for(i = 0; i < oheight; i++) - { - for(j = 0; j < owidth; j++) - { - int64_t hstart = i * dH - padH; - int64_t wstart = j * dW - padW; - int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight); - int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth); - while(hstart < 0) - hstart += dilationH; - while(wstart < 0) - wstart += dilationW; - - /* local pointers */ - scalar_t *op = output_p + k*owidth*oheight + i*owidth + j; - THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j; - - /* compute local max: */ - int64_t maxindex = -1; - scalar_t maxval = -THInf; - int64_t tcntr = 0; - int64_t x,y; - for(y = hstart; y < hend; y += dilationH) - { - for(x = wstart; x < wend; x += dilationW) - { - tcntr = y*iwidth + x; - scalar_t val = *(ip + tcntr); - if ((val > maxval) || std::isnan(val)) - { - maxval = val; - maxindex = tcntr; - } - } - } - - /* set output to local max */ - *op = maxval; - - /* store location of max */ - *indp = maxindex; - } - } - } - }); -} - -void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THIndexTensor *indices, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - int dilationW, - int dilationH, - bool ceil_mode) -{ - - int dimw = 2; - int dimh = 1; - 
int64_t nbatch = 1; - int64_t nInputPlane; - int64_t inputHeight; - int64_t inputWidth; - int64_t outputHeight; - int64_t outputWidth; - scalar_t *input_data; - scalar_t *output_data; - THIndex_t *indices_data; - - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (input, NULL, NULL, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - if (input->dim() == 4) - { - nbatch = input->size(0); - dimw++; - dimh++; - } - - /* sizes */ - nInputPlane = input->size(dimh-1); - inputHeight = input->size(dimh); - inputWidth = input->size(dimw); - outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); - outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); - - /* get contiguous input */ - input = THTensor_(newContiguous)(input); - - /* resize output */ - if (input->dim() == 3) - { - THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); - /* indices will contain the locations for each output point */ - THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth); - - input_data = input->data(); - output_data = output->data(); - indices_data = THIndexTensor_(data)(indices); - - THNN_(SpatialDilatedMaxPooling_updateOutput_frame) - (input_data, output_data, - indices_data, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - kW, kH, dW, dH, - padW, padH, - dilationW, dilationH - ); - } - else - { - THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth); - /* indices will contain the locations for each output point */ - THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth); - - input_data = input->data(); - output_data = output->data(); - indices_data = THIndexTensor_(data)(indices); - - at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { - THNN_(SpatialDilatedMaxPooling_updateOutput_frame) - (input_data+p*nInputPlane*inputWidth*inputHeight, - output_data+p*nInputPlane*outputWidth*outputHeight, - indices_data+p*nInputPlane*outputWidth*outputHeight, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - kW, kH, dW, dH, - padW, padH, - dilationW, dilationH - ); - } - }); - } - - /* cleanup */ - c10::raw::intrusive_ptr::decref(input); -} - -static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)( - scalar_t *gradInput_p, - scalar_t *gradOutput_p, - THIndex_t *ind_p, - int64_t nInputPlane, - int64_t inputWidth, - int64_t inputHeight, - int64_t outputWidth, - int64_t outputHeight, - int dW, - int dH) -{ - at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { - scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; - scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; - THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight; - - /* calculate max points */ - int64_t i, j; - for(i = 0; i < outputHeight; i++) - { - for(j = 0; j < outputWidth; j++) - { - /* retrieve position of max */ - int64_t maxp = ind_p_k[i*outputWidth + j]; - if (maxp != -1) { - /* update gradient */ - gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; - } - } - } - } - }); -} - -void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THIndexTensor *indices, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - int dilationW, - int dilationH, - bool ceil_mode) -{ - int dimw = 2; - int dimh = 1; - int64_t nbatch = 
1; - int nInputPlane; - int inputHeight; - int inputWidth; - int outputHeight; - int outputWidth; - scalar_t *gradInput_data; - scalar_t *gradOutput_data; - THIndex_t *indices_data; - - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (input, gradOutput, indices, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - /* get contiguous gradOutput */ - gradOutput = THTensor_(newContiguous)(gradOutput); - - /* resize */ - THTensor_(resizeAs)(gradInput, input); - THTensor_(zero)(gradInput); - - if (input->dim() == 4) { - nbatch = input->size(0); - dimw++; - dimh++; - } - - /* sizes */ - nInputPlane = input->size(dimh-1); - inputHeight = input->size(dimh); - inputWidth = input->size(dimw); - outputHeight = gradOutput->size(dimh); - outputWidth = gradOutput->size(dimw); - - /* get raw pointers */ - gradInput_data = gradInput->data(); - gradOutput_data = gradOutput->data(); - indices_data = THIndexTensor_(data)(indices); - - /* backprop */ - if (input->dim() == 3) - { - THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) - (gradInput_data, gradOutput_data, - indices_data, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - dW, dH); - } - else - { - at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { - THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) - (gradInput_data+p*nInputPlane*inputWidth*inputHeight, - gradOutput_data+p*nInputPlane*outputWidth*outputHeight, - indices_data+p*nInputPlane*outputWidth*outputHeight, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - dW, dH); - } - }); - } - - /* cleanup */ - c10::raw::intrusive_ptr::decref(gradOutput); -} - -#endif diff --git a/aten/src/THNN/generic/SpatialMaxUnpooling.c b/aten/src/THNN/generic/SpatialMaxUnpooling.c index ffddf6144de4..d66164499dbf 100644 --- a/aten/src/THNN/generic/SpatialMaxUnpooling.c +++ b/aten/src/THNN/generic/SpatialMaxUnpooling.c @@ -63,7 +63,7 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( THIndex_t *indices_data; - AT_CHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), + TORCH_CHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input, but got sizes: ", input->sizes()); THNN_CHECK_SHAPE_INDICES(input, indices); diff --git a/aten/src/THNN/generic/SpatialUpSamplingBicubic.c b/aten/src/THNN/generic/SpatialUpSamplingBicubic.c deleted file mode 100644 index a81d6bc0d88a..000000000000 --- a/aten/src/THNN/generic/SpatialUpSamplingBicubic.c +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/SpatialUpSamplingBicubic.c" -#else - -void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingBilinear.c b/aten/src/THNN/generic/SpatialUpSamplingBilinear.c deleted file mode 100644 index 0dc646307b11..000000000000 --- a/aten/src/THNN/generic/SpatialUpSamplingBilinear.c +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define 
TH_GENERIC_FILE "THNN/generic/SpatialUpSamplingBilinear.c" -#else - -void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingNearest.c b/aten/src/THNN/generic/SpatialUpSamplingNearest.c deleted file mode 100644 index 82f8237fe677..000000000000 --- a/aten/src/THNN/generic/SpatialUpSamplingNearest.c +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/SpatialUpSamplingNearest.c" -#else - -void THNN_(SpatialUpSamplingNearest_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 04a21329e5d5..188194cf35ed 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -390,36 +390,6 @@ TH_API void THNN_(TemporalRowConvolution_accGradParameters)( bool featFirst, accreal scale); -TH_API void THNN_(TemporalUpSamplingNearest_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeW); -TH_API void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeW, - int osizeW); - -TH_API void THNN_(TemporalUpSamplingLinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeW, - bool align_corners); -TH_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeW, - int osizeW, - bool align_corners); - TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, @@ -556,28 +526,6 @@ TH_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( int adjW, int adjH, accreal scale); -TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); -TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - TH_API void THNN_(SpatialMaxUnpooling_updateOutput)( THNNState *state, THTensor *input, @@ -592,64 +540,6 @@ TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( THIndexTensor *indices, int owidth, int oheight); -TH_API void 
THNN_(SpatialUpSamplingNearest_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeH, - int osizeW); - -TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeH, - int isizeW, - int osizeH, - int osizeW); - -TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeH, - int isizeW, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeH, - int isizeW, - int osizeH, - int osizeW, - bool align_corners); - TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, @@ -808,19 +698,6 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int pT, int pW, int pH); -TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeW, - int osizeH); -TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput); - TH_API void THNN_(FeatureLPPooling_updateOutput)( THNNState *state, THTensor *input, @@ -841,50 +718,6 @@ TH_API void THNN_(FeatureLPPooling_updateGradInput)( int stride, bool batchMode); -TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeH, - int osizeW); - -TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeT, - int isizeH, - int isizeW, - int osizeT, - int osizeH, - int osizeW); - -TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeT, - int isizeH, - int isizeW, - int osizeT, - int osizeH, - int osizeW, - bool align_corners); - TH_API void THNN_(Tanh_updateOutput)( THNNState *state, THTensor *input, diff --git a/aten/src/THNN/generic/TemporalUpSamplingLinear.c b/aten/src/THNN/generic/TemporalUpSamplingLinear.c deleted file mode 100644 index 69680540917b..000000000000 --- a/aten/src/THNN/generic/TemporalUpSamplingLinear.c +++ /dev/null @@ -1,29 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou - -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/TemporalUpSamplingLinear.c" -#else - -void THNN_(TemporalUpSamplingLinear_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void 
THNN_(TemporalUpSamplingLinear_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputWidth, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/TemporalUpSamplingNearest.c b/aten/src/THNN/generic/TemporalUpSamplingNearest.c deleted file mode 100644 index 0d3fca5ebb89..000000000000 --- a/aten/src/THNN/generic/TemporalUpSamplingNearest.c +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/TemporalUpSamplingNearest.c" -#else - -void THNN_(TemporalUpSamplingNearest_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputWidth, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c deleted file mode 100644 index af35f6e6b0ce..000000000000 --- a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c +++ /dev/null @@ -1,305 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/VolumetricAdaptiveAveragePooling.c" -#else - -#include - -#define START_IND(a,b,c) (int)floor((float)(a * c) / b) -#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) -// #define START_IND(a,b,c) a * c / b -// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 - -// 5d tensor B x D x T x H x W - -static void THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)( - scalar_t *input_p, - scalar_t *output_p, - int64_t sizeD, - int64_t isizeT, - int64_t isizeH, - int64_t isizeW, - int64_t osizeT, - int64_t osizeH, - int64_t osizeW, - int64_t istrideD, - int64_t istrideT, - int64_t istrideH, - int64_t istrideW) -{ - at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { - /* loop over output */ - int64_t ot, oh, ow; - for(ot = 0; ot < osizeT; ot++) - { - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - for(oh = 0; oh < osizeH; oh++) - { - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = 0; ow < osizeW; ow++) - { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - /* local pointers */ - scalar_t *ip = input_p + d*istrideD + istartT*istrideT + istartH*istrideH + istartW*istrideW; - scalar_t *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; - - /* compute local average: */ - scalar_t sum = 0; - int it, ih, iw; - for(it = 0; it < kT; it++) - { - for(ih = 0; ih < kH; ih++) - { - for(iw = 0; iw < kW; iw++) - { - scalar_t val = *(ip + it*istrideT + ih*istrideH + iw*istrideW); - sum += val; - } - } - } - - /* set output to local average */ - *op = sum / kT / kH / kW; - } - } - } - } - }); -} - -void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeW, - int osizeH) -{ - int dimD = 0; - int dimT = 1; - int dimH = 2; - int dimW = 3; - int64_t sizeB 
= 1; - int64_t sizeD = 0; - int64_t isizeT = 0; - int64_t isizeH = 0; - int64_t isizeW = 0; - - int64_t istrideB = 0; - int64_t istrideD = 0; - int64_t istrideT = 0; - int64_t istrideH = 0; - int64_t istrideW = 0; - - scalar_t *input_data = nullptr; - scalar_t *output_data = nullptr; - - - THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); - - if (input->dim() == 5) - { - istrideB = input->stride(0); - sizeB = input->size(0); - dimD++; - dimT++; - dimH++; - dimW++; - } - - /* sizes */ - sizeD = input->size(dimD); - isizeT = input->size(dimT); - isizeH = input->size(dimH); - isizeW = input->size(dimW); - /* strides */ - istrideD = input->stride(dimD); - istrideT = input->stride(dimT); - istrideH = input->stride(dimH); - istrideW = input->stride(dimW); - - /* resize output */ - if (input->dim() == 4) - { - THTensor_(resize4d)(output, sizeD, osizeT, osizeH, osizeW); - - input_data = input->data(); - output_data = output->data(); - - THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, - istrideD, istrideT, - istrideH, istrideW); - } - else - { - THTensor_(resize5d)(output, sizeB, sizeD, osizeT, osizeH, osizeW); - - input_data = input->data(); - output_data = output->data(); - - at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { - THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, - istrideD, istrideT, - istrideH, istrideW); - } - }); - } -} - -static void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)( - scalar_t *gradInput_p, - scalar_t *gradOutput_p, - int64_t sizeD, - int64_t isizeT, - int64_t isizeH, - int64_t isizeW, - int64_t osizeT, - int64_t osizeH, - int64_t osizeW) -{ - at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { - scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeW*isizeH; - scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeW*osizeH; - - /* calculate average */ - int64_t ot, oh, ow; - for(ot = 0; ot < osizeT; ot++) - { - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - for(oh = 0; oh < osizeH; oh++) - { - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = 0; ow < osizeW; ow++) - { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - scalar_t grad_delta = gradOutput_p_d[ot*osizeH*osizeW + oh*osizeW + ow] / kT / kH / kW; - - int it, ih, iw; - for(it = istartT; it < iendT; it++) - { - for(ih = istartH; ih < iendH; ih++) - { - for(iw = istartW; iw < iendW; iw++) - { - /* update gradient */ - gradInput_p_d[it*isizeH*isizeW + ih*isizeW + iw] += grad_delta; - } - } - } - } - } - } - } - }); -} - -void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput) -{ - int dimD = 0; - int dimT = 1; - int dimH = 2; - int dimW = 3; - int64_t sizeB = 1; - int64_t sizeD; - int64_t isizeT; - int64_t isizeH; - int64_t isizeW; - int64_t osizeT; - int64_t osizeH; - int64_t osizeW; - scalar_t *gradInput_data; - scalar_t 
*gradOutput_data; - - /* get contiguous gradOutput */ - gradOutput = THTensor_(newContiguous)(gradOutput); - - /* resize */ - THTensor_(resizeAs)(gradInput, input); - THTensor_(zero)(gradInput); - - if (input->dim() == 5) { - sizeB = input->size(0); - dimD++; - dimT++; - dimH++; - dimW++; - } - - /* sizes */ - sizeD = input->size(dimD); - isizeT = input->size(dimT); - isizeH = input->size(dimH); - isizeW = input->size(dimW); - osizeT = gradOutput->size(dimT); - osizeH = gradOutput->size(dimH); - osizeW = gradOutput->size(dimW); - - /* get raw pointers */ - gradInput_data = gradInput->data(); - gradOutput_data = gradOutput->data(); - - /* backprop */ - if (input->dim() == 4) - { - THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW); - } - else - { - at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { - THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW); - } - }); - } - - /* cleanup */ - c10::raw::intrusive_ptr::decref(gradOutput); -} - -#endif - -#undef START_IND -#undef END_IND diff --git a/aten/src/THNN/generic/VolumetricUpSamplingNearest.c b/aten/src/THNN/generic/VolumetricUpSamplingNearest.c deleted file mode 100644 index f4ff9442fd6c..000000000000 --- a/aten/src/THNN/generic/VolumetricUpSamplingNearest.c +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/VolumetricUpSamplingNearest.c" -#else - -void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputDepth, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c b/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c deleted file mode 100644 index 4eaa507298f2..000000000000 --- a/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c +++ /dev/null @@ -1,35 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou - -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/VolumetricUpSamplingTrilinear.c" -#else - -void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/init.cpp b/aten/src/THNN/init.cpp index 4d026987a7d4..97f14e2a6af0 100644 --- a/aten/src/THNN/init.cpp +++ 
b/aten/src/THNN/init.cpp @@ -127,12 +127,6 @@ #include #include -#include -#include - -#include -#include - #include #include @@ -151,24 +145,12 @@ #include #include -#include -#include - #include #include -#include -#include - -#include -#include - #include #include -#include -#include - #include #include @@ -178,20 +160,11 @@ #include #include -#include -#include - #include #include #include #include -#include -#include - -#include -#include - #include #include diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh index bb76979b6ae1..69e63ac7f3fb 100755 --- a/aten/tools/run_tests.sh +++ b/aten/tools/run_tests.sh @@ -34,6 +34,9 @@ fi if [[ -x ./cuda_half_test ]]; then ./cuda_half_test fi +if [[ -x ./cuda_distributions_test ]]; then + ./cuda_distributions_test +fi if [[ -x ./cuda_optional_test ]]; then ./cuda_optional_test fi diff --git a/benchmarks/operator_benchmark/ops/repeat_benchmark.py b/benchmarks/operator_benchmark/ops/repeat_benchmark.py new file mode 100644 index 000000000000..f9e53886f1d0 --- /dev/null +++ b/benchmarks/operator_benchmark/ops/repeat_benchmark.py @@ -0,0 +1,58 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import numpy as np +import torch + +import time + +"""Microbenchmarks for Tensor repeat operator. Supports PyTorch.""" + +input_shapes = ( + (4, 4, 1), + (16, 1, 32), + (64, 64, 1, 1), + (8, 256, 128), + (1, 64, 128, 32), + (512, 512), +) + +repeats = ( + (1, 1, 1, 64), + (1, 4, 1, 2), + (1, 2, 2, 15), + (1, 1, 3, 2), + (128, 1, 8, 1), + (1, 1, 2, 16), +) + +NUM_WARMUP_ITERS = 5 +NUM_BENCHMARK_ITERS = 10 +DTYPE_TO_BYTES = {'float' : 4} + +def generate_data_for_repeat(): + input_tensors = [torch.randn(*input_shape) for input_shape in input_shapes] + total_num_elements = 0 + for input_tensor, repeat in zip(input_tensors, repeats): + total_num_elements += input_tensor.numel() + total_num_elements += input_tensor.numel() * np.prod(repeat) + return input_tensors, (total_num_elements * DTYPE_TO_BYTES['float']) + +input_tensors, total_bytes = generate_data_for_repeat() +BYTES_TO_MB = (1. / 1000. / 1000.) + +def pt_repeat(input_tensor, repeat): + return input_tensor.repeat(repeat) + +def pt_repeat_n_times(niters): + for _ in range(niters): + for input_tensor, repeat in zip(input_tensors, repeats): + pt_repeat(input_tensor, repeat) + +if __name__ == "__main__": + # Warm up runs. 
+ pt_repeat_n_times(NUM_WARMUP_ITERS) + s = time.time() + pt_repeat_n_times(NUM_BENCHMARK_ITERS) + total_time_s = (time.time() - s) + total_time_per_iter_s = total_time_s / NUM_BENCHMARK_ITERS + achieved_bandwidth = (total_bytes * BYTES_TO_MB) / total_time_per_iter_s + print("Time:{} Achieved Bandwidth:{} MB/s".format(total_time_per_iter_s, achieved_bandwidth)) diff --git a/c10/core/Backend.h b/c10/core/Backend.h index 738c9b10cf91..709769195703 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -275,4 +275,15 @@ static inline const char* toString(Backend b) { } } +static inline bool isSparse(Backend b) { + switch (b) { + case Backend::SparseCPU: + case Backend::SparseCUDA: + case Backend::SparseHIP: + return true; + default: + return false; + } +} + } // namespace c10 diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index 7ce339d751f5..8e3013836568 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -52,13 +52,24 @@ void* alloc_cpu(size_t nbytes) { #elif defined(_MSC_VER) data = _aligned_malloc(nbytes, gAlignment); #else - CAFFE_ENFORCE_EQ(posix_memalign(&data, gAlignment, nbytes), 0); + int err = posix_memalign(&data, gAlignment, nbytes); + if (err != 0) { + CAFFE_THROW( + "DefaultCPUAllocator: can't allocate memory: you tried to allocate ", + nbytes, + " bytes. Error code ", + err, + " (", + strerror(err), + ")"); + } #endif CAFFE_ENFORCE( data, - "DefaultCPUAllocator: not enough memory: you tried to allocate %dGB. Buy new RAM!", - nbytes / 1073741824); + "DefaultCPUAllocator: not enough memory: you tried to allocate ", + nbytes, + " bytes. Buy new RAM!"); // move data to a thread's NUMA node NUMAMove(data, nbytes, GetCurrentNUMANode()); diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index 79ee6251ec66..4ee041062a57 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -39,9 +39,9 @@ DeviceType parse_type(const std::string& device_string) { } // namespace void Device::validate() { - AT_CHECK(index_ == -1 || index_ >= 0, + TORCH_CHECK(index_ == -1 || index_ >= 0, "Device index must be -1 or non-negative, got ", index_); - AT_CHECK(!is_cpu() || index_ <= 0, + TORCH_CHECK(!is_cpu() || index_ <= 0, "CPU device index must be -1 or zero, got ", index_); } @@ -56,7 +56,7 @@ void Device::validate() { // std::regex_constants::basic); // std::smatch match; // const bool ok = std::regex_match(device_string, match, regex); -// AT_CHECK(ok, "Invalid device string: '", device_string, "'"); +// TORCH_CHECK(ok, "Invalid device string: '", device_string, "'"); // if (match[1].matched) { // type_ = parse_type_from_string(match[1].str()); // } else { @@ -69,14 +69,14 @@ void Device::validate() { // index_ = std::stoi(match[3].str()); // } Device::Device(const std::string& device_string) : Device(Type::CPU) { - AT_CHECK(!device_string.empty(), "Device string must not be empty"); + TORCH_CHECK(!device_string.empty(), "Device string must not be empty"); int index = device_string.find(":"); if (index == std::string::npos) { type_ = parse_type(device_string); } else { std::string s; s = device_string.substr(0, index); - AT_CHECK(!s.empty(), "Device string must not be empty"); + TORCH_CHECK(!s.empty(), "Device string must not be empty"); type_ = parse_type(s); std::string device_index = device_string.substr(index + 1); @@ -86,7 +86,7 @@ Device::Device(const std::string& device_string) : Device(Type::CPU) { AT_ERROR("Could not parse device index '", device_index, "' in device string '", device_string, "'"); } - AT_CHECK(index_ >= 0, + 
TORCH_CHECK(index_ >= 0, "Device index must be non-negative, got ", index_); } validate(); diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h new file mode 100644 index 000000000000..14209900e4c1 --- /dev/null +++ b/c10/core/MemoryFormat.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include + +#include + +// Memory format is not the property of a Tensor. It is the way to tell an +// operator how the result should be organized in memory and nothing more. That +// means memory format should never be used as return value for any tensor state +// interrogation functions (internally and externally). +// +// Possible options are: +// Any: +// An operator can return Tensor with any memory format. This describes the +// current behavior of operators. +// +// Preserve: +// If any of the input tensors is in channels_last format, operator output +// should be in channels_last format +// +// Contiguous: +// Regardless of input tensors format, the output should be contiguous Tensor. +// +// ChannelsLast: +// Regardless of input tensors format, the output should be in channels_last format. + + +namespace c10 { +enum class MemoryFormat : int8_t { Any, Preserve, Contiguous, ChannelsLast }; + +inline std::ostream& operator<<( + std::ostream& stream, + at::MemoryFormat memory_format) { + switch (memory_format) { + case MemoryFormat::Any: + return stream << "Any"; + case MemoryFormat::Preserve: + return stream << "Preserve"; + case MemoryFormat::Contiguous: + return stream << "Contiguous"; + case MemoryFormat::ChannelsLast: + return stream << "ChannelsLast"; + default: + AT_ERROR("Unknown memory format"); + } +} + +} // namespace c10 diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index b34ed427d34f..6cb2102b2b0c 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -29,7 +29,9 @@ namespace c10 { _(std::complex, ComplexFloat, z) /* 9 */ \ _(std::complex, ComplexDouble, z) /* 10 */ \ _(bool, Bool, i) /* 11 */ \ - _(c10::qint8, QInt8, i) /* 12 */ + _(c10::qint8, QInt8, i) /* 12 */ \ + _(c10::quint8, QUInt8, i) /* 13 */ \ + _(c10::qint32, QInt32, i) /* 14 */ // If you want to support ComplexHalf for real, replace occurrences // of this macro with AT_FORALL_SCALAR_TYPES_WITH_COMPLEX. 
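The new c10/core/MemoryFormat.h above documents four options (Any, Preserve, Contiguous, ChannelsLast) but leaves the actual choice to each operator. The following is a minimal standalone sketch of those documented rules only: the enum mirrors the new c10::MemoryFormat, and resolve_output_format is a hypothetical helper invented for illustration, not code from this PR.

#include <cstdint>
#include <iostream>
#include <vector>

enum class MemoryFormat : int8_t { Any, Preserve, Contiguous, ChannelsLast };

// For the sketch, each "input" is reduced to a flag saying whether it is
// already in channels_last layout.
MemoryFormat resolve_output_format(MemoryFormat requested,
                                   const std::vector<bool>& inputs_channels_last) {
  switch (requested) {
    case MemoryFormat::Preserve:
      // Documented rule: if any input is channels_last, the output should be too.
      for (bool channels_last : inputs_channels_last) {
        if (channels_last) return MemoryFormat::ChannelsLast;
      }
      return MemoryFormat::Contiguous;
    case MemoryFormat::Any:
      // The operator may pick anything; this sketch simply defaults to Contiguous.
      return MemoryFormat::Contiguous;
    default:
      // Contiguous and ChannelsLast are explicit requests and are passed through.
      return requested;
  }
}

int main() {
  MemoryFormat out = resolve_output_format(MemoryFormat::Preserve, {false, true});
  std::cout << (out == MemoryFormat::ChannelsLast) << "\n";  // prints 1
}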
But @@ -46,7 +48,9 @@ namespace c10 { _(std::complex, ComplexFloat, z) \ _(std::complex, ComplexDouble, z) \ _(bool, Bool, i) \ - _(c10::qint8, QInt8, i) + _(c10::qint8, QInt8, i) \ + _(c10::quint8, QUInt8, i) \ + _(c10::qint32, QInt32, i) #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_AND_QINT(_) \ _(uint8_t, Byte, i) \ @@ -70,7 +74,9 @@ namespace c10 { _(at::Half, Half, d) \ _(float, Float, d) \ _(double, Double, d) \ - _(c10::qint8, QInt8, i) + _(c10::qint8, QInt8, i) \ + _(c10::quint8, QUInt8, i) \ + _(c10::qint32, QInt32, i) #define AT_FORALL_SCALAR_TYPES_EXCEPT_QINT(_) \ _(uint8_t, Byte, i) \ @@ -101,7 +107,9 @@ namespace c10 { _(int64_t, Long, i) \ _(float, Float, d) \ _(double, Double, d) \ - _(c10::qint8, QInt8, i) + _(c10::qint8, QInt8, i) \ + _(c10::quint8, QUInt8, i) \ + _(c10::qint32, QInt32, i) #define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF_AND_QINT(_) \ _(uint8_t, Byte, i) \ @@ -223,7 +231,37 @@ static inline bool isComplexType(ScalarType t) { static inline bool isQIntType(ScalarType t) { // Don't forget to extend this when adding new QInt types - return t == ScalarType::QInt8; + return t == ScalarType:: QInt8 || t == ScalarType::QUInt8 || t == ScalarType::QInt32; +} + +static inline ScalarType toQIntType(ScalarType t) { + switch (t) { + case ScalarType::Byte: + return ScalarType::QUInt8; + case ScalarType::Char: + return ScalarType::QInt8; + case ScalarType::Int: + return ScalarType::QInt32; + default: + return t; + } +} + +static inline ScalarType toUnderlying(ScalarType t) { + switch (t) { + case ScalarType::QUInt8: + return ScalarType::Byte; + case ScalarType::QInt8: + return ScalarType::Char; + case ScalarType::QInt32: + return ScalarType::Int; + default: + return t; + } +} + +static inline bool isUnderlying(ScalarType type, ScalarType qtype) { + return type == toUnderlying(qtype); } static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { diff --git a/c10/core/Stream.h b/c10/core/Stream.h index 85b3bcff656b..01e9a341ecc2 100644 --- a/c10/core/Stream.h +++ b/c10/core/Stream.h @@ -120,7 +120,7 @@ class Stream final { auto device_index = static_cast(bits) & 0xFFFFull; bits >>= 16; auto device_type = static_cast(bits); - AT_CHECK(isValidDeviceType(device_type)); + TORCH_CHECK(isValidDeviceType(device_type)); // Unfortunately, we can't check if the StreamId is valid here; it // will be checked upon first use. 
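The ScalarType.h hunk above extends the quantized dtypes to QUInt8 and QInt32 and adds toQIntType, toUnderlying, and isUnderlying. Below is a minimal standalone sketch of the intended round trip, using a local stand-in enum rather than the real c10::ScalarType.

#include <cassert>
#include <initializer_list>

enum class ScalarType { Byte, Char, Int, QInt8, QUInt8, QInt32, Other };

// Mirrors the mapping added in the diff: Byte <-> QUInt8, Char <-> QInt8, Int <-> QInt32.
ScalarType toQIntType(ScalarType t) {
  switch (t) {
    case ScalarType::Byte: return ScalarType::QUInt8;
    case ScalarType::Char: return ScalarType::QInt8;
    case ScalarType::Int:  return ScalarType::QInt32;
    default:               return t;  // other types map to themselves
  }
}

ScalarType toUnderlying(ScalarType t) {
  switch (t) {
    case ScalarType::QUInt8: return ScalarType::Byte;
    case ScalarType::QInt8:  return ScalarType::Char;
    case ScalarType::QInt32: return ScalarType::Int;
    default:                 return t;
  }
}

int main() {
  // isUnderlying(type, qtype) in the diff reduces to this round-trip property:
  for (ScalarType t : {ScalarType::Byte, ScalarType::Char, ScalarType::Int}) {
    assert(toUnderlying(toQIntType(t)) == t);
  }
  return 0;
}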
return Stream(UNSAFE, Device(device_type, device_index), stream_id); diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 41af810b1ef1..84fcb466133b 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -113,6 +113,26 @@ bool TensorImpl::has_storage() const { return storage_; } +bool TensorImpl::is_contiguous(at::MemoryFormat memory_format) const { +#ifdef DEBUG + AT_ASSERT(compute_contiguous() == is_contiguous_); +#endif + if (memory_format == at::MemoryFormat::ChannelsLast) { + if (dim() == 4) { + auto strides_1 = 1; + auto strides_3 = sizes_[1]; + auto strides_2 = strides_3 * sizes_[3]; + auto strides_0 = strides_2 * sizes_[2]; + if (strides_0 == strides_[0] && strides_1 == strides_[1] && + strides_2 == strides_[2] && strides_3 == strides_[3]) { + return true; + } + } + return false; + } + return is_contiguous_; +} + const Storage& TensorImpl::storage() const { return storage_; } @@ -135,4 +155,52 @@ at::DataPtr PlacementDeleteContext::makeDataPtr( AutogradMetaInterface::~AutogradMetaInterface() {} +/// NOTE [ Treating Variables as non-Variables in type dispatch ] +/// +/// Previously, in VariableType_*.cpp (generated by gen_variable_type.py), when +/// a function is using the 'use_derived' strategy, we call its implementation +/// on the base non-Variable type (`baseType`), passing unwrapped tensors to the +/// call so that any `.dispatch_type()` calls in the implementation can treat the passed +/// tensors as non-Variables and won't dispatch back to functions in VariableType. +/// +/// However, after the Variable/Tensor merge, there is no concept of unwrapping +/// a tensor anymore, and directly passing variables to the base type calls will +/// cause the `.dispatch_type()` dispatch in the implementation to treat the tensor as a +/// variable, and any function dispatch based on `.dispatch_type()` will dispatch back to +/// VariableType, which is not what we want. +/// +/// The solution to the above problem is to add `at::NonVariableTypeMode`, which +/// when enabled will cause `legacyTensorType()` and `getType()` to always return +/// non-Variable type, even if the tensor being called on is a variable. +/// +/// TODO: Since `torch::NoGradGuard` serves the same purpose in libtorch, we should +/// merge these two thread-local guards. + +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. In that case, we don't provide +/// `at::NonVariableTypeMode`. 
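The NonVariableTypeMode flag described in the note above is exposed only through the static is_enabled()/set_enabled() pair implemented just below. A minimal sketch of how a caller might scope the thread-local flag follows; the RAII guard is hypothetical and not part of this patch, only c10::NonVariableTypeMode itself is.

    #include <c10/core/TensorImpl.h>  // declares c10::NonVariableTypeMode in this diff

    // Hypothetical RAII helper (not in this patch): enables the thread-local
    // flag for the duration of a scope and restores the previous value after.
    struct NonVariableTypeModeGuard {
      NonVariableTypeModeGuard()
          : prev_(c10::NonVariableTypeMode::is_enabled()) {
        c10::NonVariableTypeMode::set_enabled(true);
      }
      ~NonVariableTypeModeGuard() {
        c10::NonVariableTypeMode::set_enabled(prev_);
      }
      bool prev_;
    };

    void example() {
      NonVariableTypeModeGuard guard;
      // Inside this scope, TensorImpl::is_variable() reports false even for
      // tensors that carry AutogradMeta, so type dispatch treats them as
      // plain (non-Variable) tensors, per the note above.
    }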
+#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + +thread_local bool NonVariableTypeMode_enabled = false; + +bool NonVariableTypeMode::is_enabled() { + return NonVariableTypeMode_enabled; +} + +void NonVariableTypeMode::set_enabled(bool enabled) { + NonVariableTypeMode_enabled = enabled; +} + +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + +bool NonVariableTypeMode::is_enabled() { + throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); +} + +void NonVariableTypeMode::set_enabled(bool enabled) { + throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); +} + +#endif + } // namespace c10 diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 907834c07404..7a4afeef8e2a 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -61,7 +62,7 @@ inline int64_t size_from_dim_(int k, IntArrayRef dims) { // Product of all dims up to k (not including dims[k]) inline int64_t size_to_dim_(int k, IntArrayRef dims) { - AT_ASSERT((unsigned)k <= dims.size()); + TORCH_CHECK((unsigned)k <= dims.size()); int64_t r = 1; for (int i = 0; i < k; ++i) { r *= dims[i]; @@ -71,7 +72,7 @@ inline int64_t size_to_dim_(int k, IntArrayRef dims) { // Product of all dims between k and l (not including dims[k] and dims[l]) inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { - AT_ASSERT((unsigned)l < dims.size()); + TORCH_CHECK((unsigned)l < dims.size()); int64_t r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { @@ -87,8 +88,8 @@ inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { // Wrap around axis_index if it is negative, s.t., -1 is the last dim inline int canonical_axis_index_(int axis_index, int ndims) { - AT_ASSERT(axis_index >= -ndims); - AT_ASSERT(axis_index < ndims); + TORCH_CHECK(axis_index >= -ndims); + TORCH_CHECK(axis_index < ndims); if (axis_index < 0) { return axis_index + ndims; } @@ -138,6 +139,11 @@ struct C10_API AutogradMetaInterface { virtual ~AutogradMetaInterface(); }; +struct C10_API NonVariableTypeMode { + static bool is_enabled(); + static void set_enabled(bool enabled); +}; + // NOTE [ Version Counter Sharing ] // // Every Tensor has a version counter. Version counters are incremented whenever the @@ -370,7 +376,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ virtual int64_t numel() const { #ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); + TORCH_INTERNAL_ASSERT(compute_numel() == numel_); #endif return numel_; } @@ -382,12 +388,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * compute_contiguous() for the exact definition of whether or not * a tensor is contiguous or not. */ - virtual bool is_contiguous() const { -#ifdef DEBUG - AT_ASSERT(compute_contiguous() == is_contiguous_); -#endif - return is_contiguous_; - } + virtual bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const; bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance reasons. 
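The ChannelsLast branch behind the new is_contiguous(at::MemoryFormat) overload declared above (implemented in the TensorImpl.cpp hunk earlier in this diff) only accepts 4-d tensors whose strides match the NHWC pattern derived from their sizes. A standalone re-derivation of that stride computation, for illustration only:

    #include <array>
    #include <cstdint>
    #include <iostream>

    // For a 4-d tensor with sizes (N, C, H, W), the channels_last layout keeps
    // C innermost: stride[1] = 1, stride[3] = C, stride[2] = C * W,
    // stride[0] = C * W * H.  This mirrors the check added in TensorImpl.cpp.
    std::array<int64_t, 4> channels_last_strides(const std::array<int64_t, 4>& sizes) {
      std::array<int64_t, 4> strides{};
      strides[1] = 1;
      strides[3] = sizes[1];               // C
      strides[2] = strides[3] * sizes[3];  // C * W
      strides[0] = strides[2] * sizes[2];  // C * W * H
      return strides;
    }

    int main() {
      // N=2, C=3, H=4, W=5  ->  strides (60, 1, 15, 3)
      auto s = channels_last_strides({2, 3, 4, 5});
      std::cout << s[0] << " " << s[1] << " " << s[2] << " " << s[3] << "\n";
    }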
@@ -426,25 +427,21 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } int64_t get_device() const { - if (device_opt_.has_value()) { - // See NOTE [c10::optional operator usage in CUDA] - return (*device_opt_).index(); - } - - AT_ERROR( + TORCH_CHECK( + device_opt_.has_value(), "tensor with backend ", toString(tensorTypeIdToBackend(type_id())), " does not have a device"); + // See NOTE [c10::optional operator usage in CUDA] + return (*device_opt_).index(); } Device device() const { - if (device_opt_.has_value()) { - // See NOTE [c10::optional operator usage in CUDA] - return *device_opt_; - } - - AT_ERROR( + TORCH_CHECK( + device_opt_.has_value(), "tensor with backend ", toString(tensorTypeIdToBackend(type_id())), " does not have a device"); + // See NOTE [c10::optional operator usage in CUDA] + return *device_opt_; } Layout layout() const { @@ -501,7 +498,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ bool is_wrapped_number() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged return is_wrapped_number_; } @@ -514,8 +511,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ void set_wrapped_number(bool value) { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_ASSERT(dim() == 0); + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(dim() == 0); is_wrapped_number_ = value; } @@ -563,11 +560,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [Tensor versus Variable in C++]. */ void set_requires_grad(bool requires_grad) { - if (autograd_meta()) { - autograd_meta()->set_requires_grad(requires_grad, this); - } else { - AT_ERROR("set_requires_grad is not implemented for Tensor"); - } + TORCH_INTERNAL_ASSERT(autograd_meta(), "set_requires_grad is not implemented for Tensor"); + autograd_meta()->set_requires_grad(requires_grad, this); } /** @@ -581,11 +575,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [Tensor versus Variable in C++]. */ bool requires_grad() const { - if (autograd_meta()) { - return autograd_meta()->requires_grad(); - } else { - AT_ERROR("requires_grad is not implemented for Tensor"); - } + TORCH_INTERNAL_ASSERT(autograd_meta(), "requires_grad is not implemented for Tensor"); + return autograd_meta()->requires_grad(); } /** @@ -624,15 +615,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ template inline T * data() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_CHECK(has_storage(), + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(has_storage(), "Cannot access data pointer of Tensor that doesn't have storage"); - AT_ASSERTM( + TORCH_CHECK( storage_initialized(), "The tensor has a non-zero number of elements, but its data is not allocated yet. 
" "Caffe2 uses a lazy allocation, so you will need to call " "mutable_data() or raw_mutable_data() to actually allocate memory."); - AT_ASSERTM( + TORCH_CHECK( storage_.IsType(), "Tensor type mismatch, caller expects elements to be ", caffe2::TypeMeta::TypeName(), @@ -657,10 +648,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ inline void* data() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_CHECK(has_storage(), + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(has_storage(), "Cannot access data pointer of Tensor that doesn't have storage"); - AT_ASSERT(dtype_initialized()); + TORCH_CHECK(dtype_initialized(), + "Cannot access data pointer of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); return static_cast( static_cast(storage_.data()) + data_type_.itemsize() * storage_offset_); @@ -699,7 +692,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Return the size of a single element of this tensor in bytes. */ size_t itemsize() const { - AT_ASSERT(dtype_initialized()); + TORCH_CHECK(dtype_initialized(), + "Cannot report itemsize of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); return data_type_.itemsize(); } @@ -735,7 +730,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * which is harder to misuse. */ virtual void resize_dim(int64_t ndim) { - AT_CHECK(allow_tensor_metadata_change(), "resize_dim is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "resize_dim is not allowed on Tensor created from .data or .detach()"); sizes_.resize(ndim, 0); strides_.resize(ndim, 0); refresh_numel(); @@ -751,7 +746,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * which is harder to misuse. */ virtual void set_size(int64_t dim, int64_t new_size) { - AT_CHECK(allow_tensor_metadata_change(), "set_size is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_size is not allowed on Tensor created from .data or .detach()"); sizes_.at(dim) = new_size; refresh_numel(); refresh_contiguous(); @@ -764,7 +759,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * which is harder to misuse. */ virtual void set_stride(int64_t dim, int64_t new_stride) { - AT_CHECK(allow_tensor_metadata_change(), "set_stride is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_stride is not allowed on Tensor created from .data or .detach()"); strides_[dim] = new_stride; refresh_numel(); refresh_contiguous(); @@ -778,7 +773,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * (and resizing if necessary.) 
*/ virtual void set_storage_offset(int64_t storage_offset) { - AT_CHECK(allow_tensor_metadata_change(), "set_storage_offset is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_storage_offset is not allowed on Tensor created from .data or .detach()"); storage_offset_ = storage_offset; } @@ -793,8 +788,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ void set_sizes_contiguous(IntArrayRef new_size) { - AT_CHECK(allow_tensor_metadata_change(), "set_sizes_contiguous is not allowed on Tensor created from .data or .detach()"); - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(allow_tensor_metadata_change(), "set_sizes_contiguous is not allowed on Tensor created from .data or .detach()"); + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged auto old_dim = sizes_.size(); auto new_dim = new_size.size(); @@ -818,9 +813,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ void set_sizes_and_strides(IntArrayRef new_size, IntArrayRef new_stride) { - AT_CHECK(allow_tensor_metadata_change(), "set_sizes_and_strides is not allowed on Tensor created from .data or .detach()"); - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_CHECK( + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(allow_tensor_metadata_change(), "set_sizes_and_strides is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK( new_size.size() == new_stride.size(), "dimensionality of sizes (", new_size.size(), @@ -871,10 +866,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { /** * True if a tensor is a variable. See Note [Tensor versus Variable in C++] */ - bool is_variable() const { return autograd_meta_ != nullptr; }; + bool is_variable() const { + return autograd_meta_ != nullptr && !at::NonVariableTypeMode::is_enabled(); + } /** * Set whether a tensor allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + * See NOTE [ Metadata Change for a Detached Tensor ] for details. */ virtual void set_allow_tensor_metadata_change(bool value) { allow_tensor_metadata_change_ = value; @@ -882,6 +880,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { /** * True if a tensor allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + * See NOTE [ Metadata Change for a Detached Tensor ] for details. */ virtual bool allow_tensor_metadata_change() const { return allow_tensor_metadata_change_; @@ -910,16 +909,16 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields: // 1. the AutogradMeta pointer, because it is unique for each Variable. - // 2. the version counter, because although it lives in TensorImpl, the version counter is managed - // by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what - // the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details. + // 2. the version counter, because it is set to the passed in `version_counter`. + // See NOTE [ Version Counter Sharing ] for details. 
// - // NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites - // to this function that need to change the shallow copy's size or storage afterwards, and setting - // `allow_tensor_metadata_change_` to false would prevent those changes from happening and is - // undesirable. - virtual c10::intrusive_ptr shallow_copy_and_detach() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + // NOTE: `allow_tensor_metadata_change` determines whether the TensorImpl shallow-copy + // allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + // See NOTE [ Metadata Change for a Detached Tensor ] for details. + virtual c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const { + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged auto impl = c10::make_intrusive(Storage(storage()), type_id()); impl->set_sizes_and_strides(sizes(), strides()); impl->storage_offset_ = storage_offset_; @@ -927,6 +926,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { impl->reserved_ = reserved_; impl->refresh_numel(); impl->refresh_contiguous(); + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); return impl; } @@ -965,8 +966,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The device type of a Tensor, e.g., DeviceType::CPU or DeviceType::CUDA. */ DeviceType device_type() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_ASSERT(device_opt_.has_value()); + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + // TODO: A useful internal assert would be to show that device_opt_ is null + // only if you are an undefined tensor + TORCH_CHECK(device_opt_.has_value(), "device_type cannot be run on undefined Tensor"); // See NOTE [c10::optional operator usage in CUDA] return (*device_opt_).type(); } @@ -983,9 +986,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * This op is auto-asynchronous if the underlying device (CUDA) supports it. */ void Extend(int64_t num, float growthPct) { - AT_ASSERT(sizes_.size() >= 1u); - AT_ASSERTM(num >= 0, "`num` must be non-negative for Extend"); - AT_ASSERTM( + TORCH_CHECK(sizes_.size() >= 1u); + TORCH_CHECK(num >= 0, "`num` must be non-negative for Extend"); + TORCH_CHECK( is_contiguous_, "Right now Extend is only supported for contiguous Tensor."); auto newDims = sizes_; @@ -1013,7 +1016,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Resize(newCapacity); auto* newData = raw_mutable_data(data_type_); if (data_type_.copy()) { - AT_ASSERTM( + TORCH_CHECK( device_type() == DeviceType::CPU, "non-POD types work only on CPU"); data_type_.copy()(oldData.get(), newData, oldSize); @@ -1048,10 +1051,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ template void ReserveSpace(const T& outer_dim) { - AT_ASSERTM( + TORCH_CHECK( is_contiguous_, "Right now ReserveSpace is only supported for contiguous Tensor."); - AT_ASSERTM( + TORCH_CHECK( storage_.unique(), "Can't call ReserveSpace on shared storage."); auto newCapacity = sizes_; newCapacity[0] = outer_dim; @@ -1122,15 +1125,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * This requires the total size of the tensor to remains constant. 
*/ inline void Reshape(const std::vector& dims) { - AT_ASSERTM( + TORCH_CHECK( is_contiguous_, "Right now Reshape is only supported for contiguous Tensor."); int64_t new_size = 1; for (auto d : dims) { - AT_ASSERT(d >= 0); + TORCH_CHECK(d >= 0); new_size *= d; } - AT_ASSERTM( + TORCH_CHECK( new_size == numel_, "New size and old size are not equal. You cannot use Reshape, " "but should use Resize." @@ -1172,20 +1175,20 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Right now, we are assuming the device_type are the same, since it is // inherently the same in the non-templatized code. We should probably add // an assert here which might affect perf a little bit. - AT_ASSERTM( + TORCH_CHECK( src.numel_ == numel_, "Size mismatch - did you call reshape before sharing the data?"); // It is possible that the source tensor hasn't called mutable_data() yet, // in which case ShareData() doesn't make much sense since we don't really // know what to share yet. // TODO: Add the assert after all uninitialized states are eliminated - // AT_ASSERTM(src.dtype_initialized(), + // TORCH_CHECK(src.dtype_initialized(), // "Source tensor don't have a data type (did you call mutable_data on the tensor?)"); if (!src.dtype_initialized()) { C10_LOG_EVERY_MS(WARNING, 1000) << "Source tensor don't have a data type (did you call mutable_data on the tensor?)"; } - AT_ASSERTM( + TORCH_CHECK( src.storage_initialized(), "Source tensor has no content and has size > 0"); // Finally, do sharing. @@ -1202,7 +1205,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { DataPtr&& data_ptr, const caffe2::TypeMeta& data_type, size_t capacity) { - AT_ASSERTM( + TORCH_CHECK( data_type.id() != caffe2::TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to pass in an " "initialized data_type(TypeMeta)."); @@ -1264,7 +1267,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { if (numel_ == 0 || (meta.placementNew() == nullptr && !had_special_dtor && storage_.numel() >= numel_)) { - AT_ASSERT(storage_offset_ == 0); // because we just reallocated + TORCH_INTERNAL_ASSERT(storage_offset_ == 0); // because we just reallocated return storage_.data(); } const Allocator* allocator = storage_.allocator(); @@ -1291,7 +1294,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { allocator->allocate(numel_ * storage_.itemsize())); } storage_.set_numel(numel_); - AT_ASSERT(storage_offset_ == 0); // because we just reallocated + TORCH_INTERNAL_ASSERT(storage_offset_ == 0); // because we just reallocated device_opt_ = storage_.device(); return storage_.data(); } @@ -1321,7 +1324,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * storage UNINITIALIZED after a Resize() or FreeMemory() */ bool storage_initialized() const { - AT_ASSERT(has_storage()); + TORCH_CHECK(has_storage(), "cannot call storage_initialized on tensor that does not have storage"); return storage_.data() || numel_ == 0; } @@ -1335,7 +1338,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } void set_storage(at::Storage storage) { - AT_CHECK(allow_tensor_metadata_change(), "set_storage is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_storage is not allowed on Tensor created from .data or .detach()"); storage_ = std::move(storage); data_type_ = storage_.dtype(); device_opt_ = storage_.device(); @@ -1435,7 +1438,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Recompute the cached 
numel of a tensor. Call this if you modify sizes. */ void refresh_numel() { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged numel_ = compute_numel(); } @@ -1444,7 +1447,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * or strides. */ void refresh_contiguous() { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged is_contiguous_ = compute_contiguous(); } @@ -1487,6 +1490,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // INVARIANT: When storage is non-null, this Device must // agree with the type meta in storage. + // + // INVARIANT: device_opt_ is only nullopt for undefined tensors + // (which do not have a device.) c10::optional device_opt_; // You get to have eight byte-size fields here, before you @@ -1495,15 +1501,21 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_contiguous_ = true; bool is_wrapped_number_ = false; - // Previously, if we change the tensor metadata (e.g. sizes / strides / storage / storage_offset) - // of a derived tensor (i.e. tensors created from Python `tensor.data` or Python/C++ `tensor.detach()`), - // those metadata in the original tensor will also be updated. However, the new behavior is that - // those metadata changes to a derived tensor will not update the original tensor anymore, and we - // need this flag to make such changes explicitly illegal, to prevent users from changing metadata of - // the derived tensor and expecting the original tensor to also be updated. + // NOTE [ Metadata Change for a Detached Tensor ] + // + // Normally, a user is allowed to change the tensor metadata + // (e.g. sizes / strides / storage / storage_offset) of a tensor. + // However, if the tensor is created by `t1_detached = t1.data` in Python + // or `t1_detached = t1.detach()` in Python/C++, those changes to the + // tensor metadata of `t1_detached` will not be propagated back to the + // original tensor `t1`. In order to make such changes explicitly illegal, + // we created the `allow_tensor_metadata_change_` flag, to prevent users + // from changing metadata of the detached tensor and expecting the original + // tensor to also be updated. // - // NOTE: For a full list of tensor metadata fields, please see `shallow_copy_and_detach()` in TensorImpl - // and its subclasses to find which fields are copied by value. + // NOTE: For a full list of tensor metadata fields, please see + // `shallow_copy_and_detach()` in TensorImpl and its subclasses to find + // which fields are copied by value. 
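Concretely, the allow_tensor_metadata_change_ flag documented in the NOTE above is what the TORCH_CHECKs added to set_size / set_stride / set_storage_offset / set_storage earlier in this file consult, and it is now settable per detached copy through the extra shallow_copy_and_detach parameter. A minimal sketch of the resulting behavior at the TensorImpl level (the dimension and size values are illustrative only):

    #include <c10/core/TensorImpl.h>
    #include <c10/util/Exception.h>

    // Sketch: once a TensorImpl is marked as not allowing metadata changes
    // (as a detached copy would be, depending on the flag passed to
    // shallow_copy_and_detach), the metadata setters refuse to run.
    void sketch(c10::TensorImpl* impl) {
      impl->set_allow_tensor_metadata_change(false);
      try {
        impl->set_size(/*dim=*/0, /*new_size=*/4);  // hits TORCH_CHECK(allow_tensor_metadata_change(), ...)
      } catch (const c10::Error& e) {
        // "set_size is not allowed on Tensor created from .data or .detach()"
      }
    }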
bool allow_tensor_metadata_change_ = true; // we decide to keep reserved_ and it will diff --git a/c10/core/TensorTypeIdRegistration.h b/c10/core/TensorTypeIdRegistration.h index 231f86f61842..0c51623b1f57 100644 --- a/c10/core/TensorTypeIdRegistration.h +++ b/c10/core/TensorTypeIdRegistration.h @@ -87,13 +87,13 @@ inline c10::TensorTypeId TensorTypeIdRegistrar::id() const noexcept { return id_; } -#define C10_DECLARE_TENSOR_TYPE(TensorName) \ +#define C10_DECLARE_TENSOR_TYPE(TensorName) \ C10_API ::c10::TensorTypeId TensorName() -#define C10_DEFINE_TENSOR_TYPE(TensorName) \ - ::c10::TensorTypeId TensorName() { \ +#define C10_DEFINE_TENSOR_TYPE(TensorName) \ + C10_EXPORT ::c10::TensorTypeId TensorName() { \ static ::c10::TensorTypeIdRegistrar registration_raii; \ - return registration_raii.id(); \ + return registration_raii.id(); \ } C10_DECLARE_TENSOR_TYPE(UndefinedTensorId); diff --git a/c10/core/thread_pool.cpp b/c10/core/thread_pool.cpp index cc13566e29ee..1529f74bd4a9 100644 --- a/c10/core/thread_pool.cpp +++ b/c10/core/thread_pool.cpp @@ -2,8 +2,8 @@ namespace c10 { -ThreadPool::ThreadPool(std::size_t pool_size, int numa_node_id) - : threads_(pool_size), +ThreadPool::ThreadPool(int pool_size, int numa_node_id) + : threads_(pool_size < 0 ? defaultNumThreads() : pool_size), running_(true), complete_(true), available_(threads_.size()), @@ -48,6 +48,9 @@ bool ThreadPool::inThreadPool() const { } void ThreadPool::run(const std::function& func) { + if (threads_.size() == 0) { + throw std::runtime_error("No threads to run a task"); + } std::unique_lock lock(mutex_); // Set task and signal condition variable so that a worker thread will @@ -120,20 +123,6 @@ void ThreadPool::main_loop(std::size_t index) { } // while running_ } -// constexpr initialization guaranteed to be before any static initialization -std::atomic num_threads{1}; -void setNumThreads(size_t v) { - if(-1 == num_threads.exchange(v)) { - throw std::runtime_error("Error: cannot set num threads after pool has started"); - } -} - -TaskThreadPoolBase& global_work_queue() { - static std::shared_ptr pool = - ThreadPoolRegistry()->Create("C10", 0, num_threads.exchange(-1), false); - return *pool; -} - C10_DEFINE_SHARED_REGISTRY( ThreadPoolRegistry, TaskThreadPoolBase, diff --git a/c10/core/thread_pool.h b/c10/core/thread_pool.h index b4a716ac5b6a..5fe8b416c6f9 100644 --- a/c10/core/thread_pool.h +++ b/c10/core/thread_pool.h @@ -36,6 +36,10 @@ class C10_API TaskThreadPoolBase { virtual bool inThreadPool() const = 0; virtual ~TaskThreadPoolBase() noexcept {} + + static size_t defaultNumThreads() { + return std::thread::hardware_concurrency(); + } }; class C10_API ThreadPool : public c10::TaskThreadPoolBase { @@ -66,7 +70,7 @@ class C10_API ThreadPool : public c10::TaskThreadPoolBase { ThreadPool() = delete; explicit ThreadPool( - std::size_t pool_size, + int pool_size, int numa_node_id = -1); ~ThreadPool(); @@ -102,10 +106,6 @@ class C10_API ThreadPool : public c10::TaskThreadPoolBase { void main_loop(std::size_t index); }; -C10_API void setNumThreads(size_t v); - -C10_API TaskThreadPoolBase& global_work_queue(); - class C10_API TaskThreadPool : public c10::ThreadPool { public: explicit TaskThreadPool( diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index f03ba432f26d..e5c5552ab1a4 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -338,7 +338,7 @@ struct THCCachingAllocator std::lock_guard lock(mutex); Block* block = find_allocated_block(ptr); if (!block) { - 
AT_ERROR("invalid device pointer: %p", ptr); + AT_ERROR("invalid device pointer: ", ptr); } while (block->prev) { block = block->prev; @@ -378,17 +378,21 @@ struct THCCachingAllocator void recordStream(void* ptr, cuda::CUDAStream stream) { - std::lock_guard lock(mutex); - Block* block = find_allocated_block(ptr); - if (!block) { - AT_ERROR("invalid device pointer: %p", ptr); - } - if (stream.stream() == block->stream) { - // ignore uses on the allocation stream, since those don't require any - // special synchronization - return; + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (ptr) { + std::lock_guard lock(mutex); + Block* block = find_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + block->stream_uses.insert(stream); } - block->stream_uses.insert(stream); } /** moves a block into a pool of cached free blocks */ diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 7f7f8640ae17..b23f8aa1c65f 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -68,7 +68,7 @@ class C10_CUDA_API CUDAStream { /// Construct a CUDAStream from a Stream. This construction is checked, /// and will raise an error if the Stream is not, in fact, a CUDA stream. explicit CUDAStream(Stream stream) : stream_(stream) { - AT_CHECK(stream_.device_type() == DeviceType::CUDA); + TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); } /// Construct a CUDAStream from a Stream with no error checking. diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 879a276b6db4..09e4b46fcfee 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -35,19 +35,41 @@ #define MACRO_EXPAND(args) args /// C10_NODISCARD - Warn if a type or return value is discarded. + +// Technically, we should check if __cplusplus > 201402L here, because +// [[nodiscard]] is only defined in C++17. However, some compilers +// we care about don't advertise being C++17 (e.g., clang), but +// support the attribute anyway. In fact, this is not just a good idea, +// it's the law: clang::warn_unused_result doesn't work on nvcc + clang +// and the best workaround for this case is to use [[nodiscard]] +// instead; see https://github.com/pytorch/pytorch/issues/13118 +// +// Note to future editors: if you have noticed that a compiler is +// misbehaving (e.g., it advertises support, but the support doesn't +// actually work, or it is emitting warnings). Some compilers which +// are strict about the matter include MSVC, which will complain: +// +// error C2429: attribute 'nodiscard' requires compiler flag '/std:c++latest' +// +// Exhibits: +// - MSVC 19.14: https://godbolt.org/z/Dzd7gn (requires /std:c++latest) +// - Clang 8.0.0: https://godbolt.org/z/3PYL4Z (always advertises support) +// - gcc 8.3: https://godbolt.org/z/4tLMQS (always advertises support) #define C10_NODISCARD -#if __cplusplus > 201402L && defined(__has_cpp_attribute) -#if __has_cpp_attribute(nodiscard) -#undef C10_NODISCARD -#define C10_NODISCARD [[nodiscard]] -#endif +#if defined(__has_cpp_attribute) +# if __has_cpp_attribute(nodiscard) +# undef C10_NODISCARD +# define C10_NODISCARD [[nodiscard]] +# endif // Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious // error when __has_cpp_attribute is given a scoped attribute in C mode. 
#elif __cplusplus && defined(__has_cpp_attribute) -#if __has_cpp_attribute(clang::warn_unused_result) -#undef C10_NODISCARD -#define C10_NODISCARD [[clang::warn_unused_result]] -#endif +# if __has_cpp_attribute(clang::warn_unused_result) +// TODO: It's possible this is still triggering https://github.com/pytorch/pytorch/issues/13118 +// on Windows; if it is, better fix it. +# undef C10_NODISCARD +# define C10_NODISCARD [[clang::warn_unused_result]] +# endif #endif // suppress an unused variable. diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 989e12143935..2f376d46b9b0 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -145,13 +145,13 @@ class ArrayRef final { /// front - Get the first element. AT_CPP14_CONSTEXPR const T& front() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + TORCH_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); return Data[0]; } /// back - Get the last element. AT_CPP14_CONSTEXPR const T& back() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); return Data[Length - 1]; } @@ -163,7 +163,7 @@ class ArrayRef final { /// slice(n, m) - Chop off the first N elements of the array, and keep M /// elements in the array. AT_CPP14_CONSTEXPR ArrayRef slice(size_t N, size_t M) const { - AT_CHECK( + TORCH_CHECK( N + M <= size(), "ArrayRef: invalid slice, N = ", N, @@ -188,7 +188,7 @@ class ArrayRef final { /// Vector compatibility AT_CPP14_CONSTEXPR const T& at(size_t Index) const { - AT_CHECK( + TORCH_CHECK( Index < Length, "ArrayRef: invalid index Index = ", Index, diff --git a/c10/util/Deprecated.h b/c10/util/Deprecated.h index b9bcf22eb207..59acf78d6d21 100644 --- a/c10/util/Deprecated.h +++ b/c10/util/Deprecated.h @@ -39,36 +39,45 @@ // Sample usage: // // using BadType C10_DEPRECATED_USING = int; -// -#if defined(__cplusplus) && __cplusplus >= 201402L +// technically [[deprecated]] syntax is from c++14 standard, but it works in +// many compilers. +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(deprecated) # define C10_DEPRECATED_USING [[deprecated]] -#elif defined(_MSC_VER) && defined(__CUDACC__) -// Apparently, [[deprecated]] doesn't work on nvcc on Windows; +#endif +#endif + +#if !defined(C10_DEPRECATED_USING) && defined(_MSC_VER) +#if defined(__CUDACC__) +// [[deprecated]] doesn't work on nvcc on Windows; // you get the error: // // error: attribute does not apply to any entity // // So we just turn the macro off in this case. # define C10_DEPRECATED_USING -#elif defined(_MSC_VER) -// __declspec(deprecated) does not work in using declarations: -// https://godbolt.org/z/lOwe1h -// but it seems that most of C++14 is available in MSVC even if you don't ask for -// it. (It's also harmless to specify an attribute because it is C++11 supported -// syntax; you mostly risk it not being understood). Some more notes at -// https://blogs.msdn.microsoft.com/vcblog/2016/06/07/standards-version-switches-in-the-compiler/ +#else +// [[deprecated]] does work in windows without nvcc, though msc doesn't support +// `__has_cpp_attribute`. # define C10_DEPRECATED_USING [[deprecated]] -#elif defined(__CUDACC__) +#endif +#endif + +#if !defined(C10_DEPRECATED_USING) && defined(__GNUC__) // nvcc has a bug where it doesn't understand __attribute__((deprecated)) -// declarations even when the host compiler supports it. 
It's OK -// with [[deprecated]] though (although, if you are on an old version -// of gcc which doesn't understand attributes, you'll get a -Wattributes -// error that it is ignored -# define C10_DEPRECATED_USING [[deprecated]] -#elif defined(__GNUC__) +// declarations even when the host compiler supports it. We'll only use this gcc +// attribute when not cuda, and when using a GCC compiler that doesn't support +// the c++14 syntax we checked for above (availble in __GNUC__ >= 5) +#if !defined(__CUDACC__) # define C10_DEPRECATED_USING __attribute__((deprecated)) #else +// using cuda + gcc < 5, neither deprecated syntax is available so turning off. +# define C10_DEPRECATED_USING +#endif +#endif + +#if ! defined(C10_DEPRECATED_USING) # warning "You need to implement C10_DEPRECATED_USING for this compiler" # define C10_DEPRECATED_USING #endif diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 8c4fbed94008..6d270d944a54 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -3,6 +3,7 @@ #include "c10/macros/Macros.h" #include "c10/util/StringUtil.h" +#include "c10/util/Deprecated.h" #include #include @@ -19,8 +20,8 @@ namespace c10 { /// The primary ATen error class. /// Provides a complete error message with source location information via -/// `what()`, and a more concise message via `what_without_backtrace()`. Should -/// primarily be used with the `AT_ERROR` macro. +/// `what()`, and a more concise message via `what_without_backtrace()`. +/// Don't throw this directly; use TORCH_CHECK/TORCH_INTERNAL_ASSERT instead. /// /// NB: c10::Error is handled specially by the default torch to suppress the /// backtrace, see torch/csrc/Exceptions.h @@ -102,7 +103,8 @@ class C10_API Warning { }; // Used in ATen for out-of-bound indices that can reasonably only be detected -// lazily inside a kernel (See: advanced indexing). +// lazily inside a kernel (See: advanced indexing). These turn into +// IndexError when they cross to Python. class C10_API IndexError : public Error { using Error::Error; }; @@ -112,58 +114,260 @@ class C10_API IndexError : public Error { // exception type before its what() content C10_API std::string GetExceptionString(const std::exception& e); -} // namespace c10 +namespace detail { + +// Return x if it is non-empty; otherwise return y. +inline std::string if_empty_then(std::string x, std::string y) { + if (x.empty()) { + return y; + } else { + return x; + } +} -// TODO: variants that print the expression tested and thus don't require -// strings -// TODO: CAFFE_ENFORCE_WITH_CALLER style macro +} -// TODO: move AT_ERROR to C10_ERROR -// TODO: consolidate the enforce and assert messages. Assert is a bit confusing -// as c++ assert quits, while this throws. -// TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if -// not met. -// In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t), +} // namespace c10 + +// Private helper macro for implementing TORCH_INTERNAL_ASSERT and TORCH_CHECK +// +// Note: In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t), // which is different from the definition of `SourceLocation` that requires // unsigned int (a.k.a uint32_t) and may cause a compile error with the message: // error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion // Here the static cast is used to pass the build. +#define C10_THROW_ERROR(err_type, msg) \ + throw ::c10::err_type({__func__, __FILE__, static_cast(__LINE__)}, msg) -#define AT_ERROR(...) 
\ - throw ::c10::Error({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) +// Private helper macro for workaround MSVC misexpansion of nested macro +// invocations involving __VA_ARGS__. See +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define C10_EXPAND_MSVC_WORKAROUND(x) x -#define AT_INDEX_ERROR(...) \ - throw ::c10::IndexError({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) -#define AT_WARN(...) \ - ::c10::Warning::warn({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) +// ---------------------------------------------------------------------------- +// Error reporting macros +// ---------------------------------------------------------------------------- -#define AT_ASSERT(cond) \ +// A utility macro to provide assert()-like functionality; that is, enforcement +// of internal invariants in code. It supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the assert +// failure message using operator<< (this is useful to print some variables +// which may be useful for debugging.) +// +// Usage: +// TORCH_INTERNAL_ASSERT(should_be_true); +// TORCH_INTERNAL_ASSERT(x == 0, "x = ", x); +// +// Assuming no bugs in PyTorch, the conditions tested by this macro should +// always be true; e.g., it should be possible to disable all of these +// conditions without changing observable user behavior. If you would like to +// do error reporting for user input, please use TORCH_CHECK instead. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike assert()). +// +#ifdef C10_MOBILE +#define TORCH_INTERNAL_ASSERT(cond, ...) \ if (!(cond)) { \ - AT_ERROR( \ - #cond " ASSERT FAILED at ", \ - __FILE__, \ - ":", \ - __LINE__, \ - ", please report a bug to PyTorch."); \ + C10_THROW_ERROR(Error, \ + #cond " INTERNAL ASSERT FAILED at" \ + __FILE__ \ + ); \ } - -#define AT_ASSERTM(cond, ...) \ +#else +#define TORCH_INTERNAL_ASSERT(cond, ...) \ if (!(cond)) { \ - AT_ERROR(::c10::str( \ - #cond, \ - " ASSERT FAILED at ", \ + C10_THROW_ERROR(Error, ::c10::str( \ + #cond " INTERNAL ASSERT FAILED at ", \ __FILE__, \ ":", \ __LINE__, \ ", please report a bug to PyTorch. ", \ - __VA_ARGS__)); \ + ::c10::str(__VA_ARGS__) \ + )); \ } +#endif + +// A utility macro to make it easier to test for error conditions from user +// input. Like TORCH_INTERNAL_ASSERT, it supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the error +// message using operator<< (e.g., you can pass any object which has +// operator<< defined. Most objects in PyTorch have these definitions!) +// +// Usage: +// TORCH_CHECK(should_be_true); // A default error message will be provided +// // in this case; but we recommend writing an +// // explicit error message, as it is more +// // user friendly. +// TORCH_CHECK(x == 0, "Expected x to be 0, but got ", x); +// +// On failure, this macro will raise an exception. If this exception propagates +// to Python, it will convert into a Python RuntimeError. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike CHECK() from glog.) +// +#ifdef C10_MOBILE +#define TORCH_CHECK(cond, ...) 
\ + if (!(cond)) { \ + C10_THROW_ERROR(Error, \ + #cond " CHECK FAILED at " \ + __FILE__ \ + ); \ + } +#else +#define TORCH_CHECK(cond, ...) \ + if (!(cond)) { \ + C10_THROW_ERROR(Error, \ + ::c10::detail::if_empty_then( \ + ::c10::str(__VA_ARGS__), \ + "Expected " #cond " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)" \ + ) \ + ); \ + } +#endif +// TODO: We're going to get a lot of similar looking string literals +// this way; check if this actually affects binary size. -#define AT_CHECK(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(::c10::str(__VA_ARGS__)); \ +// Like TORCH_CHECK, but raises IndexErrors instead of Errors. +#ifdef C10_MOBILE +#define TORCH_CHECK_INDEX(cond, ...) \ + if (!(cond)) { \ + C10_THROW_ERROR(Error, \ + #cond " INDEX CHECK FAILED at " \ + __FILE__ \ + ); \ + } +#else +#define TORCH_CHECK_INDEX(cond, ...) \ + if (!(cond)) { \ + C10_THROW_ERROR(IndexError, \ + ::c10::detail::if_empty_then( \ + ::c10::str(__VA_ARGS__), \ + "Expected " #cond " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)" \ + ) \ + ); \ } +#endif + + +// Report a warning to the user. Accepts an arbitrary number of extra +// arguments which are concatenated into the warning message using operator<< +// +#define TORCH_WARN(...) \ + ::c10::Warning::warn({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) + + +// ---------------------------------------------------------------------------- +// Deprecated macros +// ---------------------------------------------------------------------------- + +namespace c10 { namespace detail { + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ERROR(msg) is deprecated, use TORCH_CHECK(false, msg) instead.") +*/ +inline void deprecated_AT_ERROR() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_INDEX_ERROR(msg) is deprecated, use TORCH_CHECK_INDEX(false, msg) instead.") +*/ +inline void deprecated_AT_INDEX_ERROR() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_WARN is deprecated, use TORCH_WARN instead.") +*/ +inline void deprecated_AT_WARN() {} + +C10_DEPRECATED_MESSAGE("AT_CHECK is deprecated, use TORCH_CHECK instead.") +inline void deprecated_AT_CHECK() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ASSERT is deprecated, if you mean to indicate an internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user error checking, use " \ + "TORCH_CHECK. See https://github.com/pytorch/pytorch/issues/20287 for more details.") +*/ +inline void deprecated_AT_ASSERT() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ASSERTM is deprecated, if you mean to indicate an internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user error checking, use " \ + "TORCH_CHECK. See https://github.com/pytorch/pytorch/issues/20287 for more details.") +*/ +inline void deprecated_AT_ASSERTM() {} + +}} // namespace c10::detail + +// Deprecated alias; this alias was deprecated because it wasn't clear to +// people that you should use a macro with AT_ prefix inside the torch/csrc +// directory. Use TORCH_CHECK instead. +#define AT_CHECK(...) 
\ + do { \ + ::c10::detail::deprecated_AT_CHECK(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(__VA_ARGS__)); \ + } while (false); + +// Deprecated alias; this alias was deprecated because people kept mistakenly +// using it for user error checking. Use TORCH_INTERNAL_ASSERT or TORCH_CHECK +// instead. See https://github.com/pytorch/pytorch/issues/20287 for more details. +#define AT_ASSERT(...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERT(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)); \ + } while (false); + +// Deprecated alias, like AT_ASSERT. The new TORCH_INTERNAL_ASSERT macro supports +// both 0-ary and variadic calls, so having a separate message-accepting macro +// is not necessary. +// +// NB: we MUST include cond explicitly here, as MSVC will miscompile the macro +// expansion, shunting all of __VA_ARGS__ to cond. An alternate workaround +// can be seen at +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define AT_ASSERTM(cond, ...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERTM(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__)); \ + } while (false); + +// Deprecated alias; this alias was deprecated because it represents extra API +// surface that makes it hard for people to understand what macro to use. +// Use TORCH_CHECK(false, ...) or TORCH_INTERNAL_ASSERT(false, ...) to +// unconditionally fail at a line of code. +#define AT_ERROR(...) \ + do { \ + ::c10::detail::deprecated_AT_ERROR(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \ + } while (false); + +// Deprecated alias; this alias was deprecated for consistency with TORCH_CHECK. +#define AT_INDEX_ERROR(...) \ + do { \ + ::c10::detail::deprecated_AT_INDEX_ERROR(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK_INDEX(false, ::c10::str(__VA_ARGS__))); \ + } while (false); + +// Deprecated alias; this alias was deprecated because it wasn't clear to +// people that you should use a macro with AT_ prefix inside the torch/csrc +// directory. Use TORCH_WARN instead. +#define AT_WARN(...) 
\ + do { \ + ::c10::detail::deprecated_AT_WARN(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_WARN(__VA_ARGS__)); \ + } while (false); + #endif // C10_UTIL_EXCEPTION_H_ diff --git a/c10/util/Registry.h b/c10/util/Registry.h index 544ebed93aef..060f80d5fa0d 100644 --- a/c10/util/Registry.h +++ b/c10/util/Registry.h @@ -71,9 +71,11 @@ class Registry { if (registry_.count(key) != 0) { auto cur_priority = priority_[key]; if (priority > cur_priority) { + #ifdef DEBUG std::string warn_msg = "Overwriting already registered item for key " + KeyStrRepr(key); fprintf(stderr, "%s\n", warn_msg.c_str()); + #endif registry_[key] = creator; priority_[key] = priority; } else if (priority == cur_priority) { diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp index 4f49fad6646f..7c46b1332074 100644 --- a/c10/util/StringUtil.cpp +++ b/c10/util/StringUtil.cpp @@ -26,8 +26,8 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { } size_t ReplaceAll(std::string& s, const char* from, const char* to) { - AT_CHECK(from && *from, ""); - AT_CHECK(to, ""); + TORCH_CHECK(from && *from, ""); + TORCH_CHECK(to, ""); size_t numReplaced = 0; std::string::size_type lenFrom = std::strlen(from); diff --git a/c10/util/numa.cpp b/c10/util/numa.cpp index 2f4ceda56f18..99d58c23f846 100644 --- a/c10/util/numa.cpp +++ b/c10/util/numa.cpp @@ -27,7 +27,7 @@ void NUMABind(int numa_node_id) { return; } - AT_CHECK( + TORCH_CHECK( numa_node_id <= numa_max_node(), "NUMA node id ", numa_node_id, @@ -46,7 +46,7 @@ int GetNUMANode(const void* ptr) { AT_ASSERT(ptr); int numa_node = -1; - AT_CHECK( + TORCH_CHECK( get_mempolicy( &numa_node, NULL, @@ -83,7 +83,7 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) { numa_node_id >= 0 && static_cast(numa_node_id) < sizeof(unsigned long) * 8); unsigned long mask = 1UL << numa_node_id; - AT_CHECK( + TORCH_CHECK( mbind( reinterpret_cast(page_start_ptr), size + offset, diff --git a/c10/util/qint32.h b/c10/util/qint32.h new file mode 100644 index 000000000000..0aa744ee1701 --- /dev/null +++ b/c10/util/qint32.h @@ -0,0 +1,15 @@ +#pragma once +#include + +namespace c10 { + +/** + * qint32 is for signed 32 bit quantized Tensors + */ +struct alignas(4) qint32 { + using underlying = int32_t; + int32_t val_; + explicit qint32(int32_t val) : val_(val) {} +}; + +} // namespace c10 diff --git a/c10/util/qint8.h b/c10/util/qint8.h index e6c37216803c..27dd7b37351a 100644 --- a/c10/util/qint8.h +++ b/c10/util/qint8.h @@ -5,12 +5,13 @@ namespace c10 { /** * This is the data type for quantized Tensors. Right now we only have - * qint8 which is for 8 bit Tensors, we might have 4 bit, 2 bit or 1 bit - * data types in the future. + * qint8 which is for 8 bit Tensors, and qint32 for 32 bit int Tensors, + * we might have 4 bit, 2 bit or 1 bit data types in the future. 
*/ struct alignas(1) qint8 { - uint8_t val_; - explicit qint8(uint8_t val) : val_(val) {} + using underlying = int8_t; + int8_t val_; + explicit qint8(int8_t val) : val_(val) {} }; } // namespace c10 diff --git a/c10/util/quint8.h b/c10/util/quint8.h new file mode 100644 index 000000000000..0dbef3764283 --- /dev/null +++ b/c10/util/quint8.h @@ -0,0 +1,15 @@ +#pragma once +#include + +namespace c10 { + +/** + * qint8 is for signed 8 bit quantized Tensors + */ +struct alignas(1) quint8 { + using underlying = uint8_t; + uint8_t val_; + explicit quint8(uint8_t val) : val_(val) {} +}; + +} // namespace c10 diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index edfe204f6e41..0d56050a4de5 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -78,10 +78,13 @@ CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(24, int*) CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(25, detail::_guard_long_unique); CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE( 26, - detail::_guard_long_unique>); + detail::_guard_long_unique>) + +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(27, float*) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(28, at::Half*) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(29, c10::qint8) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(30, c10::quint8) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(31, c10::qint32) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(32, _CaffeHighestPreallocatedTypeId) -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(27, c10::qint8); -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(28, _CaffeHighestPreallocatedTypeId) -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(29, float*) -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(30, at::Half*) } // namespace caffe2 diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 058a0a7809e9..36e39458ac0f 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -17,15 +17,16 @@ #include -#include "c10/macros/Macros.h" -#include "c10/util/Backtrace.h" -#include "c10/util/C++17.h" -#include "c10/util/Exception.h" -#include "c10/util/Half.h" -#include "c10/util/IdWrapper.h" -#include "c10/util/qint8.h" - -#include "c10/util/Type.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* * TypeIdentifier is a small type containing an id. @@ -623,8 +624,11 @@ CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE( 26, detail::_guard_long_unique>) -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(27, c10::qint8); -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(28, _CaffeHighestPreallocatedTypeId) -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(29, float*) -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(30, at::Half*) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(27, float*) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(28, at::Half*) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(29, c10::qint8) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(30, c10::quint8) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(31, c10::qint32) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(32, _CaffeHighestPreallocatedTypeId) + } // namespace caffe2 diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index d19d88aa0d3c..96c2ed92b81a 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -54,11 +54,6 @@ else() # See cmake/Codegen.cmake for header installation endif() -# ---[ Torch build -if(BUILD_TORCH) - add_subdirectory(../torch torch) -endif() - # ---[ Caffe2 build # Note: the folders that are being commented out have not been properly # addressed yet. 
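The three quantized tag types registered with typeid above (c10::qint8, c10::quint8, c10::qint32) pair with the ScalarType helpers added earlier in this diff (toQIntType, toUnderlying, isUnderlying, and the extended isQIntType). A small sketch of the intended round-trip, assuming the headers from this patch are on the include path:

    #include <c10/core/ScalarType.h>
    #include <iostream>

    int main() {
      using c10::ScalarType;

      // Each quantized type maps onto the plain integer type that backs it.
      std::cout << std::boolalpha
                << (c10::toQIntType(ScalarType::Byte) == ScalarType::QUInt8) << "\n"   // true
                << (c10::toUnderlying(ScalarType::QInt32) == ScalarType::Int) << "\n"  // true
                << c10::isUnderlying(ScalarType::Char, ScalarType::QInt8) << "\n"      // true
                << c10::isQIntType(ScalarType::Float) << "\n";                         // false
    }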
@@ -211,12 +206,630 @@ if(NOT BUILD_ATEN_ONLY) endif() endif() + +# ========================================================== +# formerly-libtorch +# ========================================================== + +set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../torch") +set(TORCH_ROOT "${TORCH_SRC_DIR}/..") + +if(NOT TORCH_INSTALL_BIN_DIR) + set(TORCH_INSTALL_BIN_DIR bin) +endif() + +if(NOT TORCH_INSTALL_INCLUDE_DIR) + set(TORCH_INSTALL_INCLUDE_DIR include) +endif() + +if(NOT TORCH_INSTALL_LIB_DIR) + set(TORCH_INSTALL_LIB_DIR lib) +endif() + + + +if (NOT INTERN_BUILD_MOBILE) + + + set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + + # Generate files + set(TOOLS_PATH "${TORCH_ROOT}/tools") + + configure_file("${TORCH_ROOT}/aten/src/ATen/common_with_cwrap.py" + "${TOOLS_PATH}/shared/cwrap_common.py" + COPYONLY) + + configure_file("${TORCH_SRC_DIR}/_utils_internal.py" + "${TOOLS_PATH}/shared/_utils_internal.py" + COPYONLY) + + + set(GENERATED_CXX_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp" + "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp" + "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_1.cpp" + "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_2.cpp" + ) + + set(GENERATED_H_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h" + ) + + set(GENERATED_THNN_CXX_CUDA ${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp) + set(GENERATED_THNN_CXX ${TORCH_SRC_DIR}/csrc/nn/THNN.cpp) + + set(GENERATED_CXX_PYTHON + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" + ) + + set(GENERATED_H_PYTHON + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods_dispatch.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_dispatch.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions_dispatch.h" + ) + + set(GENERATED_THNN_SOURCES + ${GENERATED_THNN_CXX} + ${GENERATED_THNN_CXX_CUDA} + ) + + set(TORCH_GENERATED_CODE + ${GENERATED_CXX_TORCH} + ${GENERATED_THNN_SOURCES} + ${GENERATED_H_TORCH} + ${GENERATED_CXX_PYTHON} + ${GENERATED_H_PYTHON} + ) + + add_custom_command( + OUTPUT + ${TORCH_GENERATED_CODE} + COMMAND + "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py + --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + --nn-path "aten/src" + DEPENDS + "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + "${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h" + "${TOOLS_PATH}/autograd/templates/VariableType.h" + "${TOOLS_PATH}/autograd/templates/VariableType.cpp" + "${TOOLS_PATH}/autograd/templates/Functions.h" + "${TOOLS_PATH}/autograd/templates/Functions.cpp" + "${TOOLS_PATH}/autograd/templates/python_functions.h" + "${TOOLS_PATH}/autograd/templates/python_functions.cpp" + 
"${TOOLS_PATH}/autograd/templates/python_variable_methods.cpp" + "${TOOLS_PATH}/autograd/templates/python_variable_methods_dispatch.h" + "${TOOLS_PATH}/autograd/templates/python_torch_functions.cpp" + "${TOOLS_PATH}/autograd/templates/python_torch_functions_dispatch.h" + "${TOOLS_PATH}/autograd/templates/python_nn_functions.cpp" + "${TOOLS_PATH}/autograd/templates/python_nn_functions.h" + "${TOOLS_PATH}/autograd/templates/python_nn_functions_dispatch.h" + "${TOOLS_PATH}/autograd/templates/variable_factories.h" + "${TOOLS_PATH}/autograd/deprecated.yaml" + "${TOOLS_PATH}/autograd/derivatives.yaml" + "${TOOLS_PATH}/autograd/gen_autograd_functions.py" + "${TOOLS_PATH}/autograd/gen_autograd.py" + "${TOOLS_PATH}/autograd/gen_python_functions.py" + "${TOOLS_PATH}/autograd/gen_variable_factories.py" + "${TOOLS_PATH}/autograd/gen_variable_type.py" + "${TOOLS_PATH}/autograd/load_derivatives.py" + "${TOOLS_PATH}/autograd/nested_dict.py" + "${TOOLS_PATH}/autograd/utils.py" + "${TOOLS_PATH}/jit/gen_jit_dispatch.py" + "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" + WORKING_DIRECTORY "${TORCH_ROOT}") + + + # Required workaround for libtorch_python.so build + # see https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories + add_custom_target( + generate-torch-sources + DEPENDS ${TORCH_GENERATED_CODE} + ) + + + set(TORCH_SRCS + ${GENERATED_CXX_TORCH} + ${GENERATED_H_TORCH} + ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp + ${TORCH_SRC_DIR}/csrc/autograd/function.cpp + ${TORCH_SRC_DIR}/csrc/autograd/function_hook.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp + ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp + ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp + ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp + ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp + ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp + ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp + ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp + ${TORCH_SRC_DIR}/csrc/jit/attributes.cpp + ${TORCH_SRC_DIR}/csrc/jit/argument_spec.cpp + ${TORCH_SRC_DIR}/csrc/jit/export.cpp + ${TORCH_SRC_DIR}/csrc/jit/pass_manager.cpp + ${TORCH_SRC_DIR}/csrc/jit/pickler.cpp + ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp + ${TORCH_SRC_DIR}/csrc/jit/import_source.cpp + ${TORCH_SRC_DIR}/csrc/jit/import.cpp + ${TORCH_SRC_DIR}/csrc/jit/import_export_helpers.cpp + ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp + ${TORCH_SRC_DIR}/csrc/jit/constants.cpp + ${TORCH_SRC_DIR}/csrc/jit/node_hashing.cpp + ${TORCH_SRC_DIR}/csrc/jit/ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/irparser.cpp + ${TORCH_SRC_DIR}/csrc/jit/netdef_converter.cpp + ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_c10_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/subgraph_matcher.cpp + ${TORCH_SRC_DIR}/csrc/jit/symbolic_script.cpp + ${TORCH_SRC_DIR}/csrc/jit/profiling_record.cpp + ${TORCH_SRC_DIR}/csrc/jit/profiling_graph_executor_impl.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/alias_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_pooling.cpp + 
${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inline_autodiff_subgraphs.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/insert_guards.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/erase_number_types.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inline_fork_wait.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/lower_grad_of.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/remove_inplace_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_autogradzero.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/subgraph_rewrite.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/python_print.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/utils/subgraph_utils.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/utils/check_alias_annotation.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/utils/memory_dag.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/quantization.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_quantized_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/scope.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp + ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/jit/testing/file_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/final_returns.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/schema_matching.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/script_type_parser.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/sugared_value.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/class_type.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/parser.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/edit_distance.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/logging.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/jit_exception.cpp + ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp + ${TORCH_SRC_DIR}/csrc/jit/hooks_for_testing.cpp + ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp + ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/executor.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/codegen.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/fallback.cpp + ${TORCH_ROOT}/test/cpp/jit/test.cpp + ) + + if (WIN32) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_win.cpp + ) + else () + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_unix.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp + ) + if (USE_CUDA AND NOT USE_ROCM) + list(APPEND Caffe2_GPU_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/fused_kernel.cpp + ) + add_library(thnvrtc SHARED ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/thnvrtc.cpp) + target_link_libraries(thnvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB}) + target_include_directories(thnvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) + install(TARGETS thnvrtc DESTINATION 
"${TORCH_INSTALL_LIB_DIR}") + + endif() + endif () + + if (USE_CUDA) + list(APPEND Caffe2_GPU_SRCS + ${TORCH_SRC_DIR}/csrc/autograd/profiler_cuda.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp + ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp + ) + endif() + + if (USE_ROCM) + list(APPEND Caffe2_HIP_SRCS + ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp + ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp + ) + endif() + + if (NOT NO_API) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/datasets/mnist.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/distributed.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp + ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/batchnorm.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/conv.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/dropout.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/embedding.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/functional.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/linear.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/named_any.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/rnn.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adagrad.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adam.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/optimizer.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/serialize.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp + ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp + ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp + ) + endif() + + + list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS}) +endif() + +# ========================================================== +# END formerly-libtorch sources +# ========================================================== + + # Compile exposed libraries. add_library(caffe2 ${Caffe2_CPU_SRCS}) -if (NOT WIN32) + + +option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) + + +# This is required for older versions of CMake, which don't allow +# specifying add_library() without a list of source files +set(DUMMY_EMPTY_FILE ${CMAKE_BINARY_DIR}/empty.cpp) + +if (MSVC) + set(DUMMY_FILE_CONTENT "__declspec(dllexport) int ignore_this_library_placeholder(){return 0\\;}") +else() + set(DUMMY_FILE_CONTENT "") +endif() + +file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT}) + + +# Wrapper library for transition to merged libcaffe and libtorch. +# Only necessary on Windows? +# Contains "caffe2" and "caffe2_gpu". +if (TORCH_STATIC) + add_library(torch STATIC ${DUMMY_EMPTY_FILE}) +else() + add_library(torch SHARED ${DUMMY_EMPTY_FILE}) +endif() + + +target_link_libraries(torch caffe2) + +# ========================================================== +# formerly-libtorch flags +# ========================================================== + +if (NOT INTERN_BUILD_MOBILE) + + # Forces caffe2.pb.h to be generated before its dependents are compiled. + # Adding the generated header file to the ${TORCH_SRCS} list is not sufficient + # to establish the dependency, since the generation procedure is declared in a different CMake file. 
+ # See https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories + add_dependencies(caffe2 Caffe2_PROTO) + + target_compile_definitions(caffe2 PUBLIC _THP_CORE) + + + # until they can be unified, keep these lists synced with setup.py + if(MSVC) + + if (MSVC_Z7_OVERRIDE) + set(MSVC_DEBINFO_OPTION "/Z7") + else() + set(MSVC_DEBINFO_OPTION "/Zi") + endif() + + target_compile_options(caffe2 PUBLIC + ${MSVC_RUNTIME_LIBRARY_OPTION} + ${MSVC_DEBINFO_OPTION} + /EHa + /DNOMINMAX + /wd4267 + /wd4251 + /wd4522 + /wd4522 + /wd4838 + /wd4305 + /wd4244 + /wd4190 + /wd4101 + /wd4996 + /wd4275 + /bigobj + ) + else() + target_compile_options(caffe2 PUBLIC + # -std=c++11 + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + # Clang has an unfixed bug leading to spurious missing braces + # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 + -Wno-missing-braces + ) + + if(NOT APPLE) + target_compile_options(caffe2 PRIVATE + # Considered to be flaky. See the discussion at + # https://github.com/pytorch/pytorch/pull/9608 + -Wno-maybe-uninitialized) + endif() + + endif() + + if (MSVC) + elseif (WERROR) + target_compile_options(caffe2 PRIVATE -Werror -Wno-strict-overflow) + endif() + + + if (NOT NO_API) + target_include_directories(caffe2 PRIVATE + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) + endif() + + if(USE_CUDA) + if(MSVC) + if (NOT NVTOOLEXT_HOME) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + endif() + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + target_include_directories(caffe2 PUBLIC "${NVTOOLEXT_HOME}/include") + + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + + else() + find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) + set(TORCH_CUDA_LIBRARIES + ${LIBNVTOOLSEXT} + ${CUDA_LIBRARIES}) + endif() + + target_compile_definitions(caffe2 PRIVATE USE_CUDA) + endif() + + + set(TH_CPU_INCLUDE + # dense + aten/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/aten/src/TH + ${TORCH_ROOT}/aten/src + ${CMAKE_CURRENT_BINARY_DIR}/aten/src + ${CMAKE_BINARY_DIR}/aten/src) + target_include_directories(caffe2 PRIVATE ${TH_CPU_INCLUDE}) + + set(ATen_CPU_INCLUDE + ${TORCH_ROOT}/aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen + ${CMAKE_BINARY_DIR}/aten/src) + target_include_directories(caffe2 PRIVATE ${ATen_CPU_INCLUDE}) + + target_include_directories(caffe2 PRIVATE + ${TORCH_SRC_DIR}/csrc) + + target_include_directories(caffe2 PRIVATE + ${TORCH_ROOT}/third_party/miniz-2.0.8) + + + set_property(TARGET caffe2 PROPERTY CXX_STANDARD 11) + + + # Prevent the unused functions being optimized away + # Otherwise torch.dll will be linked without caffe2_gpu.dll + if (MSVC) + # TODO What to do with this line? 
+ set_target_properties(caffe2 PROPERTIES LINK_FLAGS "/OPT:NOREF") + endif() + + install(DIRECTORY "${TORCH_SRC_DIR}/csrc" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch + FILES_MATCHING PATTERN "*.h") + install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) + + + if (BUILD_TEST AND NOT MSVC AND NOT USE_ROCM) + add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) + endif() + + if (BUILD_TEST AND NOT NO_API) + add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api) + endif() + + + # XXX This ABI check cannot be run with arm-linux-androideabi-g++ + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if (DEFINED GLIBCXX_USE_CXX11_ABI) + message(STATUS "_GLIBCXX_USE_CXX11_ABI is already defined as a cmake variable") + else() + message(STATUS "${CMAKE_CXX_COMPILER} ${TORCH_SRC_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") + execute_process( + COMMAND + "${CMAKE_CXX_COMPILER}" + "${TORCH_SRC_DIR}/abi-check.cpp" + "-o" + "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) + if (ABI_CHECK_COMPILE_RESULT) + message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") + endif() + execute_process( + COMMAND "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_RESULT + OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) + if (ABI_CHECK_RESULT) + message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") + endif() + endif() + message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") + endif() + + # CMake config for external projects. + configure_file( + ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + @ONLY) + configure_file( + ${TORCH_ROOT}/cmake/TorchConfig.cmake.in + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + @ONLY) + install(FILES + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + DESTINATION share/cmake/Torch) + + if (USE_DISTRIBUTED) + add_subdirectory(${TORCH_SRC_DIR}/lib/THD lib_THD) + if (NOT MSVC AND NOT APPLE) + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + endif() + endif() + + + # ---[ Torch python bindings build + add_subdirectory(../torch torch) + + +endif() +# ========================================================== +# END formerly-libtorch flags +# ========================================================== + + + + + + + + + + + + + + + + + + +if (NOT NO_API) + target_include_directories(caffe2 PUBLIC + $ + $) +endif() + + +find_package(OpenMP QUIET) +if(USE_OPENMP AND OPENMP_FOUND) + message(STATUS "pytorch is compiling with OpenMP. \n" + "OpenMP CXX_FLAGS: ${OpenMP_CXX_FLAGS}. \n" + "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}.") + target_compile_options(caffe2 INTERFACE ${OpenMP_CXX_FLAGS}) + target_link_libraries(caffe2 PRIVATE ${OpenMP_CXX_LIBRARIES}) +endif() + + +if(USE_ROCM) + + # XXX kostmo +# target_link_libraries(caffe2 PUBLIC caffe2_hip_library) + + target_compile_definitions(caffe2 PRIVATE + USE_ROCM + __HIP_PLATFORM_HCC__ + ) + target_include_directories(caffe2 PRIVATE + /opt/rocm/include + /opt/rocm/hcc/include + /opt/rocm/rocblas/include + /opt/rocm/hipsparse/include + ) +endif() + + +if (NOT WIN32 AND NOT USE_ASAN) + # Enable hidden visibility by default to make it easier to debug issues with + # TORCH_API annotations. Hidden visibility with selective default visibility + # behaves close enough to Windows' dllimport/dllexport. 
+ # + # Unfortunately, hidden visibility messes up some ubsan warnings because + # templated classes crossing library boundary get duplicated (but identical) + # definitions. It's easier to just disable it. target_compile_options(caffe2 PRIVATE "-fvisibility=hidden") endif() + if(NOT BUILD_ATEN_ONLY) caffe2_interface_library(caffe2_protos caffe2_protos_whole) target_link_libraries(caffe2 PRIVATE caffe2_protos_whole) @@ -244,9 +857,9 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") # Set standard properties on the target torch_set_target_props(caffe2) -if (NOT MSVC) - target_compile_options(caffe2 INTERFACE "$<$:-std=c++11>") -endif() +#if (NOT MSVC) +# target_compile_options(caffe2 INTERFACE "$<$:-std=c++11>") +#endif() target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") if (MSVC AND NOT BUILD_SHARED_LIBS) @@ -325,7 +938,17 @@ if (MSVC AND BUILD_SHARED_LIBS) endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") + + install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) + +install(TARGETS torch DESTINATION "${TORCH_INSTALL_LIB_DIR}") + +if (MSVC AND NOT TORCH_STATIC) + install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) +endif() + + caffe2_interface_library(caffe2 caffe2_library) list(APPEND Caffe2_MAIN_LIBS caffe2_library) # Install PDB files for MSVC builds @@ -335,12 +958,15 @@ endif() # ---[ CUDA library. if(USE_CUDA) + set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) torch_cuda_based_add_library(caffe2_gpu ${Caffe2_GPU_SRCS}) set(CUDA_LINK_LIBRARIES_KEYWORD) target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart) target_link_libraries(caffe2_gpu PUBLIC c10_cuda) + target_link_libraries(caffe2_gpu PUBLIC ${TORCH_CUDA_LIBRARIES}) + target_include_directories( caffe2_gpu INTERFACE $) target_include_directories( @@ -375,10 +1001,14 @@ if(USE_CUDA) caffe2_interface_library(caffe2_gpu caffe2_gpu_library) list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library) + # Install PDB files for MSVC builds if (MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION lib OPTIONAL) endif() + + target_link_libraries(torch caffe2_gpu) + endif() # ---[ Caffe2 HIP sources. diff --git a/caffe2/core/c10_operator.h b/caffe2/core/c10_operator.h index a82d0d8402d6..27d66523ad9d 100644 --- a/caffe2/core/c10_operator.h +++ b/caffe2/core/c10_operator.h @@ -1,10 +1,9 @@ #pragma once +#if !defined(CAFFE2_IS_XPLAT_BUILD) #include #include -#if !defined(CAFFE2_IS_XPLAT_BUILD) #include -#endif #include namespace caffe2 { @@ -156,7 +155,6 @@ inline std::unique_ptr noCache() { * - If your operator has a variable number of input tensors, make the first (!) * input an input of type TensorList. There must be no other tensor inputs. 
*/ -#if !defined(CAFFE2_IS_XPLAT_BUILD) #define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \ namespace caffe2 { \ namespace _c10_ops { \ @@ -180,24 +178,26 @@ inline std::unique_ptr noCache() { static auto registry_##OperatorName##_##__COUNTER__ = \ ::c10::RegisterOperators().op( \ ::caffe2::_c10_ops::schema_##OperatorName(), \ - ::c10::kernel( \ - &::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>, \ - &::caffe2::detail::noCache), \ - ::c10::dispatchKey(::c10::CPUTensorId())); + ::c10::RegisterOperators::options() \ + .kernel( \ + &::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>, \ + &::caffe2::detail::noCache) \ + .dispatchKey(::c10::CPUTensorId())); #define C10_REGISTER_CAFFE2_OPERATOR_CUDA(OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ static auto registry_##OperatorName##_##__COUNTER__ = \ ::c10::RegisterOperators().op( \ ::caffe2::_c10_ops::schema_##OperatorName(), \ - ::c10::kernel( \ - &::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>, \ - &::caffe2::detail::noCache), \ - ::c10::dispatchKey(::c10::CUDATensorId())); + ::c10::RegisterOperators::options() \ + .kernel( \ + &::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>, \ + &::caffe2::detail::noCache) \ + .dispatchKey(::c10::CUDATensorId())); // You should never manually call the C10_REGISTER_CAFFE2_OPERATOR_HIP macro. // The C10_REGISTER_CAFFE2_OPERATOR_CUDA macro from above will be automatically @@ -207,12 +207,13 @@ inline std::unique_ptr noCache() { static auto registry_##OperatorName##_##__COUNTER__ = \ ::c10::RegisterOperators().op( \ ::caffe2::_c10_ops::schema_##OperatorName(), \ - ::c10::kernel( \ - &::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>, \ - &::caffe2::detail::noCache), \ - ::c10::dispatchKey(::c10::HIPTensorId())); + ::c10::RegisterOperators().options() \ + .kernel( \ + &::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>, \ + &::caffe2::detail::noCache) \ + .dispatchKey(::c10::HIPTensorId())); #else // Don't use c10 dispatcher on mobile because of binary size diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index 7e4c6e51bfeb..41a314403489 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -190,6 +190,15 @@ int ExecutorHelper::GetNumWorkers() const { CAFFE_THROW("Not implemented"); } +// benchmark an individual run so that we can FeedBlobs with new inputs +// no warmup +// return time taken in microseconds +float NetBase::TEST_Benchmark_One_Run() { + Timer timer; + CAFFE_ENFORCE(Run(), "Run has failed."); + return timer.MicroSeconds(); +} + std::vector NetBase::TEST_Benchmark( const int warmup_runs, const int main_runs, diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 17a7e329cf7a..7bfb47fe1ca0 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -17,7 +17,6 @@ #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" #include "caffe2/core/tensor.h" -#include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/simple_queue.h" @@ -63,6 +62,13 @@ class CAFFE2_API NetBase : public Observable { virtual bool RunAsync(); + /* Benchmarks a network for one individual run so that we can feed new + * inputs on additional calls. 
+ * This function returns the number of microseconds spent + * during the benchmark + */ + virtual float TEST_Benchmark_One_Run(); + /** * Benchmarks a network. * diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 4bd3e1a715ef..57e93df605f6 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -29,8 +29,14 @@ C10_DEFINE_bool( C10_DEFINE_bool( caffe2_operator_throw_if_fp_exceptions, false, - "If set, throws if floating point exceptions (FE_DIVBYZERO, FE_INVALID, " - "FE_OVERFLOW) are detected when running any operator."); + "If set, throws if floating point exceptions (FE_DIVBYZERO, FE_INVALID) " + "are detected when running any operator. FE_OVERFLOW is handled separately " + "by caffe2_operator_throw_if_fp_overflow_exceptions option."); +C10_DEFINE_bool( + caffe2_operator_throw_if_fp_overflow_exceptions, + false, + "If set, throws if floating point exception FE_OVERFLOW is detected when " + "running any operator."); namespace caffe2 { @@ -63,8 +69,11 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws) type_ = operator_def.type(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) namespace { -int compute_input_size_(const std::vector& inputs) { +int +C10_UNUSED // Suppress unused function warning on mobile. +compute_input_size_(const std::vector& inputs) { if (inputs.empty()) { return 0; } @@ -103,6 +112,7 @@ OperatorBase::OperatorBase( input_tensors_.resize(input_size_); output_tensors_.resize(newstyle_outputs_.size()); } +#endif vector OperatorBase::InputTensorShapes() const { vector tps; @@ -577,18 +587,43 @@ TensorShapes InferBlobShapesAndTypes( return tps; } -void LoadInt8TensorInfoOfBlob(float* scale, float* offset, const Blob* b) { - const int8::Int8TensorCPU* i8tc = +void LoadInt8TensorInfoOfBlob( + std::vector* scale, + std::vector* offset, + uint32_t* axis, + const Blob* b) { + const int8::Int8TensorCPU* int8_tensor = static_cast(b->GetRaw()); - *scale = i8tc->scale; - *offset = i8tc->zero_point; + scale->clear(); + offset->clear(); + scale->push_back(int8_tensor->scale); + offset->push_back(int8_tensor->zero_point); + *axis = 1; } TensorShape GetTensorShapeOfBlob(const Blob* b) { - TypeCall type_fun = GetTypeCallFunction(b->meta().id()); - TensorInfoCall tensor_info_fun = GetTensorInfoFunction(b->meta().id()); TensorShape tp; +#ifndef C10_MOBILE + auto function_ptr = + ExternalTensorFunctionsBaseRegistry()->Create(b->meta().id()); + if (function_ptr != nullptr) { + // This is dnnlowp tensor and we cant deal with it using regular path + auto dtype = function_ptr->GetExternalTensorType(b->GetRaw()); + tp.set_data_type(TypeMetaToDataType(dtype)); + size_t _capacity; + DeviceOption _device; + auto dshape = + function_ptr->GetExternalTensorInfo(b->GetRaw(), &_capacity, &_device); + for (auto d : dshape) { + tp.add_dims(d); + } + return tp; + } +#endif + + TypeCall type_fun = GetTypeCallFunction(b->meta().id()); + TensorInfoCall tensor_info_fun = GetTensorInfoFunction(b->meta().id()); if (type_fun) { tp.set_data_type(TypeMetaToDataType(type_fun(b->GetRaw()))); } @@ -737,9 +772,21 @@ std::function GetOperatorLogger() { c10::optional OperatorBase::argumentIndexWithName( const std::string& name) const { +#if !defined(CAFFE2_IS_XPLAT_BUILD) return getFunctionSchema().argumentIndexWithName(name); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } OperatorBase::~OperatorBase() noexcept = default; +#ifndef C10_MOBILE +C10_DEFINE_TYPED_REGISTRY( + ExternalTensorFunctionsBaseRegistry, + TypeIdentifier, + 
ExternalTensorFunctionsBase, + std::unique_ptr); +#endif + } // namespace caffe2 diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9e4c5952a82f..b094036a0706 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -6,12 +6,15 @@ #include #include #include +#include #include +#include #include #include -#include "c10/macros/Macros.h" -#include "c10/util/Registry.h" +#include +#include +#include #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" @@ -26,9 +29,12 @@ #include "caffe2/utils/proto_utils.h" #include +#if !defined(CAFFE2_IS_XPLAT_BUILD) #include +#endif C10_DECLARE_bool(caffe2_operator_throw_if_fp_exceptions); +C10_DECLARE_bool(caffe2_operator_throw_if_fp_overflow_exceptions); namespace c10 { struct FunctionSchema; @@ -50,10 +56,12 @@ class CAFFE2_API OperatorBase : public Observable { * Alternatively, inputs can be one tensor list ivalue followed by non-tensors * to represent operators with a variable number of inputs. */ +#if !defined(CAFFE2_IS_XPLAT_BUILD) explicit OperatorBase( const c10::FunctionSchema& schema, std::vector inputs, std::vector outputs); +#endif virtual ~OperatorBase() noexcept; @@ -61,12 +69,20 @@ class CAFFE2_API OperatorBase : public Observable { * New operators should be instantiated with FunctionSchema */ bool isLegacyOperator() const { +#if !defined(CAFFE2_IS_XPLAT_BUILD) return !fn_schema_; +#else + return true; +#endif } const c10::FunctionSchema& getFunctionSchema() const { CAFFE_ENFORCE(!isLegacyOperator()); +#if !defined(CAFFE2_IS_XPLAT_BUILD) return *fn_schema_.get(); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } /** @brief Checks if the operator has an argument of the given name. @@ -88,10 +104,14 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::GetSingleArgument( *operator_def_, name, default_value); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; return value.template to(); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } template @@ -100,10 +120,12 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::HasSingleArgumentOfType( *operator_def_, name); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) template inline vector GetVectorFromIValueList(const c10::IValue& value) const { return value.template to>(); } +#endif template inline vector GetRepeatedArgument( @@ -114,10 +136,14 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::GetRepeatedArgument( *operator_def_, name, default_value); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; return GetVectorFromIValueList(value); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } // Get the inputs and outputs as specific types. 
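The CAFFE2_IS_XPLAT_BUILD guards above compile the FunctionSchema-based (new-style) paths out of mobile/xplat builds while keeping the legacy OperatorDef path, and isLegacyOperator() selects between the two at runtime. The sketch below shows the shape of that dual argument lookup using invented stand-in types (Value, Schema, Op); it is a simplification, not the real caffe2::OperatorBase or c10 API.

// Simplified stand-ins: Value plays the role of c10::IValue, Schema the role
// of c10::FunctionSchema, and Op the role of OperatorBase.
#include <cstdint>
#include <map>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

struct Value {
  int64_t i = 0;
};

struct Schema {
  std::vector<std::string> arg_names;
  std::optional<size_t> argumentIndexWithName(const std::string& name) const {
    for (size_t idx = 0; idx < arg_names.size(); ++idx) {
      if (arg_names[idx] == name) {
        return idx;
      }
    }
    return std::nullopt;
  }
};

struct Op {
  std::map<std::string, int64_t> legacy_args;  // arguments from an OperatorDef
  const Schema* schema = nullptr;              // null => legacy operator
  std::vector<Value> newstyle_inputs;          // schema-indexed inputs

  bool isLegacyOperator() const {
    return schema == nullptr;
  }

  int64_t GetSingleArgument(const std::string& name, int64_t default_value) const {
    if (isLegacyOperator()) {
      auto it = legacy_args.find(name);
      return it == legacy_args.end() ? default_value : it->second;
    }
    // New-style path: look the argument up by name in the schema, then index
    // into the inputs. On an xplat/mobile build this branch is compiled out
    // and replaced by a throw, which is what the CAFFE_THROW fallbacks do.
    auto index = schema->argumentIndexWithName(name);
    if (!index) {
      throw std::runtime_error("Couldn't get index for argument: " + name);
    }
    return newstyle_inputs[*index].i;
  }
};

int main() {
  Schema s{{"batch_size", "crop"}};
  Op legacy{{{"crop", 224}}, nullptr, {}};
  Op newstyle{{}, &s, {Value{32}, Value{224}}};
  // Both paths resolve "crop" to 224.
  return legacy.GetSingleArgument("crop", -1) ==
          newstyle.GetSingleArgument("crop", -1)
      ? 0
      : 1;
}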
@@ -165,6 +191,7 @@ class CAFFE2_API OperatorBase : public Observable { throw enf; } } +#if !defined(CAFFE2_IS_XPLAT_BUILD) DCHECK_LT(0, newstyle_inputs_.size()); IValue ival; if (newstyle_inputs_[0].isTensorList()) { @@ -186,6 +213,9 @@ class CAFFE2_API OperatorBase : public Observable { CAFFE_ENFORCE_EQ(tensor.GetDeviceType(), type); input_tensors_[idx] = std::move(tensor); return input_tensors_[idx]; +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } template @@ -207,6 +237,7 @@ class CAFFE2_API OperatorBase : public Observable { // When you get a Tensor here it is not fully initialized return BlobGetMutableTensor(outputs_.at(idx), type); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto& output = newstyle_outputs_[idx]; Tensor tensor = caffe2::Tensor(output); if (!tensor.defined() || tensor.GetDeviceType() != type) { @@ -216,6 +247,9 @@ class CAFFE2_API OperatorBase : public Observable { } output_tensors_[idx] = caffe2::Tensor(output); return &output_tensors_[idx]; +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } inline Tensor @@ -232,10 +266,14 @@ class CAFFE2_API OperatorBase : public Observable { void SetOutputTensor(int idx, Tensor tensor) { if (!isLegacyOperator()) { +#if !defined(CAFFE2_IS_XPLAT_BUILD) newstyle_outputs_[idx] = at::Tensor(tensor); // also update the tensor in the hack output_tensors_[idx] = std::move(tensor); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } else { // update the tensor in the workspace BlobSetTensor(outputs_.at(idx), std::move(tensor)); @@ -257,6 +295,7 @@ class CAFFE2_API OperatorBase : public Observable { "device must be provided in options."); return BlobGetMutableTensor(outputs_.at(idx), dims, options); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto& output = newstyle_outputs_[idx]; Tensor tensor = GetSizedTensorWithOptions(caffe2::Tensor(output), dims, options); @@ -265,6 +304,9 @@ class CAFFE2_API OperatorBase : public Observable { output_tensors_[idx] = caffe2::Tensor(output); return &output_tensors_[idx]; +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } // Get output Tensor of the operator and CopyFrom the given Tensor @@ -349,7 +391,11 @@ class CAFFE2_API OperatorBase : public Observable { if (isLegacyOperator()) { return outputs_.size(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) return newstyle_outputs_.size(); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } inline const vector& Inputs() const { return inputs_; } inline const vector& Outputs() { return outputs_; } @@ -540,9 +586,11 @@ class CAFFE2_API OperatorBase : public Observable { return helper_; } +#if !defined(CAFFE2_IS_XPLAT_BUILD) std::vector move_newstyle_outputs() && { return std::move(newstyle_outputs_); } +#endif public: static const int kNoNetPositionSet = -1; @@ -556,9 +604,11 @@ class CAFFE2_API OperatorBase : public Observable { vector inputs_; vector outputs_; // Preferrably use c10::optional, but nvcc doesn't work +#if !defined(CAFFE2_IS_XPLAT_BUILD) std::unique_ptr fn_schema_; vector newstyle_inputs_; vector newstyle_outputs_; +#endif // HACK // We preserve the fact that Output() returns Tensor* // by storing Tensor in a vector owned by the @@ -618,6 +668,7 @@ inline NetDef OperatorBase::GetSingleArgument( return NetDef(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) template <> inline vector OperatorBase::GetVectorFromIValueList( const c10::IValue& value) const { @@ -649,6 +700,7 @@ inline vector 
OperatorBase::GetVectorFromIValueList( vector out; return out; } +#endif // OP_SINGLE_ARG provides a shorter initialization choice for initialization of // member variables for the class constructors. @@ -688,6 +740,7 @@ class Operator : public OperatorBase { // constructors will run on that device. context_.SwitchToDevice(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) explicit Operator( const c10::FunctionSchema& fn_schema, std::vector inputs, @@ -697,6 +750,7 @@ class Operator : public OperatorBase { // constructors will run on that device. context_.SwitchToDevice(); } +#endif ~Operator() noexcept override {} /// Retrieve a non-owning reference to the input at position 'idx' for this @@ -844,6 +898,8 @@ class Operator : public OperatorBase { CAFFE_ENFORCE( !std::fetestexcept(FE_INVALID), "Invalid floating point exception (FE_INVALID) reported."); + } + if (FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) { CAFFE_ENFORCE( !std::fetestexcept(FE_OVERFLOW), "Overflow floating point exception (FE_OVERFLOW) reported."); @@ -1342,8 +1398,11 @@ CAFFE2_API void SetOpEnginePref( const std::string& op_type, const CaffeMap& op_pref); -CAFFE2_API void -LoadInt8TensorInfoOfBlob(float* scale, float* offset, const Blob* b); +CAFFE2_API void LoadInt8TensorInfoOfBlob( + std::vector* scale, + std::vector* offset, + uint32_t* axis, + const Blob* b); CAFFE2_API TensorShape GetTensorShapeOfBlob(const Blob* b); @@ -1375,6 +1434,57 @@ CAFFE2_API std::set GetRegisteredOperators(); CAFFE2_API void SetOperatorLogger(std::function tracer); std::function GetOperatorLogger(); +#ifndef C10_MOBILE +// This is for transferring tensor data between C2 and backends. +struct ExternalTensorDescriptor { + uint64_t dataType; + uint32_t dimensions; + const uint64_t* shape; + uint32_t quantizationAxis; + uint64_t quantizationParams; + const float* scales; + const int32_t* biases; + uint64_t buffer; +}; + +class ExternalTensorFunctionsBase { + public: + explicit ExternalTensorFunctionsBase() {} + virtual ~ExternalTensorFunctionsBase() {} + virtual bool IsSameMetaType(TypeIdentifier id) = 0; + virtual void SetupExternalTensorDescriptor( + const Blob* blob, + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets, + ExternalTensorDescriptor* desc) = 0; + virtual void LoadInfoOfBlob( + const Blob* blob, + std::vector* scale, + std::vector* offset, + uint32_t* axis) = 0; + virtual TypeIdentifier GetTypeMetaId(const string& name) = 0; + virtual TypeMeta GetExternalTensorType(const void* c) = 0; + virtual vector GetExternalTensorInfo( + const void* c, + size_t* capacity, + DeviceOption* device) = 0; +}; + +C10_DECLARE_TYPED_REGISTRY( + ExternalTensorFunctionsBaseRegistry, + TypeIdentifier, + ExternalTensorFunctionsBase, + std::unique_ptr); + +#define REGISTER_EXTERNAL_TENSOR_FUNCTIONS(id, ...) 
\ + C10_REGISTER_TYPED_CLASS(ExternalTensorFunctionsBaseRegistry, id, __VA_ARGS__) +inline unique_ptr CreateExternalTensorFunctions( + TypeIdentifier id) { + return ExternalTensorFunctionsBaseRegistry()->Create(id); +} +#endif // C10_MOBILE + } // namespace caffe2 diff --git a/caffe2/core/operator_c10wrapper.h b/caffe2/core/operator_c10wrapper.h index d18d1fcb1858..ab9e636b0a70 100644 --- a/caffe2/core/operator_c10wrapper.h +++ b/caffe2/core/operator_c10wrapper.h @@ -1,5 +1,7 @@ #pragma once +// TODO Also register c10 operators on mobile +#if !defined(CAFFE2_IS_XPLAT_BUILD) #include #include #include @@ -180,7 +182,7 @@ class C10OperatorWrapper final : public Operator { if (default_value.has_value()) { return this->template GetSingleArgument(name, default_value->to()); } else { - AT_CHECK( + TORCH_CHECK( this->template HasSingleArgumentOfType(name), "Error in caffe2->c10 wrapper: Expected argument '", name, @@ -225,9 +227,8 @@ createC10OperatorWrapper(const char* op_name, const char* overload_name) { } } // namespace detail +} // namespace caffe2 -// TODO Also register c10 operators on mobile -#if !defined(CAFFE2_IS_XPLAT_BUILD) // TODO Currently we only register the CPU variant. This is going to be fixed // once the tensor detemplatization lands. #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( \ @@ -256,4 +257,3 @@ createC10OperatorWrapper(const char* op_name, const char* overload_name) { #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_HIP( \ OperatorName, Name) #endif -} // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 182b1563e5ee..67018bfcaea1 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -57,9 +57,11 @@ TypeMeta GetTensorType(const void* c) { const Tensor* tc = static_cast(c); return tc->dtype(); } + TypeMeta GetInt8TensorType(const void* c) { - const int8::Int8TensorCPU* i8tc = static_cast(c); - return (i8tc->t).dtype(); + const int8::Int8TensorCPU* int8_tensor = + static_cast(c); + return (int8_tensor->t).dtype(); } // TODO(jerryzh): Remove @@ -98,9 +100,11 @@ vector GetTensorInfo( vector GetInt8TensorInfo(const void* c, size_t* capacity, DeviceOption* device) { - const int8::Int8TensorCPU* i8tc = static_cast(c); - return GetTensorInfo(&(i8tc->t), capacity, device); + const int8::Int8TensorCPU* int8_tensor = + static_cast(c); + return GetTensorInfo(&(int8_tensor->t), capacity, device); } + // since we only have one tensor, probably need to remove this at some point? 
static CaffeMap tensor_info_call_registry_{ {TypeMeta::Id(), GetTensorInfo}, diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index f65e7712c641..9fbc160b2509 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -519,8 +519,8 @@ class CAFFE2_API Tensor final { return impl_.get()->strides(); } - inline bool is_contiguous() const { - return impl_.get()->is_contiguous(); + inline bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const { + return impl_.get()->is_contiguous(memory_format); } /** @@ -649,6 +649,9 @@ Tensor TensorCPUFromValues(at::IntArrayRef dims, at::ArrayRef values) { return r; } +vector +GetTensorInfo(const void* c, size_t* capacity, DeviceOption* device); + class CAFFE2_API TensorPrinter { public: explicit TensorPrinter( diff --git a/caffe2/image/image_input_op.cc b/caffe2/image/image_input_op.cc index a01994c46592..be21e791ad16 100644 --- a/caffe2/image/image_input_op.cc +++ b/caffe2/image/image_input_op.cc @@ -19,21 +19,21 @@ REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp); OPERATOR_SCHEMA(ImageInput) .NumInputs(0, 1) .NumOutputs(2, INT_MAX) - .TensorInferenceFunction( - [](const OperatorDef& def, const vector& /* unused */ ) { - vector out(2); - ArgumentHelper helper(def); - int batch_size = helper.GetSingleArgument("batch_size", 0); - int crop = helper.GetSingleArgument("crop", -1); - int color = helper.GetSingleArgument("color", 1); - CHECK_GT(crop, 0); - out[0] = CreateTensorShape( - vector{batch_size, crop, crop, color ? 3 : 1}, - TensorProto::FLOAT); - out[1] = - CreateTensorShape(vector{1, batch_size}, TensorProto::INT32); - return out; - }) + .TensorInferenceFunction([](const OperatorDef& def, + const vector& /* unused */) { + vector out(2); + ArgumentHelper helper(def); + int batch_size = helper.GetSingleArgument("batch_size", 0); + int crop = helper.GetSingleArgument("crop", -1); + int color = helper.GetSingleArgument("color", 1); + CHECK_GT(crop, 0); + out[0] = CreateTensorShape( + vector{batch_size, crop, crop, color ? 3 : 1}, + TensorProto::FLOAT); + out[1] = + CreateTensorShape(vector{1, batch_size}, TensorProto::INT32); + return out; + }) .SetDoc(R"DOC( Imports and processes images from a database. For each run of the operator, batch_size images will be processed. GPUs can optionally be used for @@ -51,76 +51,117 @@ The following transformations are applied to the image The dimension of the output image will always be cropxcrop )DOC") - .Arg("batch_size", "Number of images to output for each run of the operator" - ". Must be 1 or greater") + .Arg( + "batch_size", + "Number of images to output for each run of the operator" + ". Must be 1 or greater") .Arg("color", "Number of color channels (1 or 3). Defaults to 1") .Arg("color_jitter", "Whether or not to do color jitter. Defaults to 0") - .Arg("img_saturation", "Image saturation scale used in color jittering. " - "Defaults to 0.4") - .Arg("img_brightness", "Image brightness scale used in color jittering. " - "Defaults to 0.4") - .Arg("img_contrast", "Image contrast scale used in color jittering. " - "Defaults to 0.4") - .Arg("color_lighting", "Whether or not to do color lighting." - " Defaults to 0") - .Arg("color_lighting_std", "Std of normal distribution where color lighting" + .Arg( + "img_saturation", + "Image saturation scale used in color jittering. " + "Defaults to 0.4") + .Arg( + "img_brightness", + "Image brightness scale used in color jittering. " + "Defaults to 0.4") + .Arg( + "img_contrast", + "Image contrast scale used in color jittering. 
" + "Defaults to 0.4") + .Arg( + "color_lighting", + "Whether or not to do color lighting." + " Defaults to 0") + .Arg( + "color_lighting_std", + "Std of normal distribution where color lighting" " scaling factor is sampled. Defaults to 0.1") - .Arg("scale_jitter_type", "Type 0: No scale jittering " - "Type 1: Inception-style scale jittering") - .Arg("label_type", "Type 0: single integer label for multi-class " + .Arg( + "scale_jitter_type", + "Type 0: No scale jittering " + "Type 1: Inception-style scale jittering") + .Arg( + "label_type", + "Type 0: single integer label for multi-class " "classification. Type 1: sparse active label indices for multi-label " "classification. Type 2: dense label embedding vector for label " "embedding regression") - .Arg("scale", "Scale the size of the smallest dimension of the image to" - " this. Scale and minsize are mutually exclusive." - " Must be larger than crop") - .Arg("minsize", "Scale the size of the smallest dimension of the image to" - " this only if the size is initially smaller. Scale and minsize are" - " mutually exclusive. Must be larger than crop.") - .Arg("warp", "If 1, both dimensions of the image will be set to minsize or" - " scale; otherwise, the other dimension is proportionally scaled." - " Defaults to 0") + .Arg( + "scale", + "Scale the size of the smallest dimension of the image to" + " this. Scale and minsize are mutually exclusive." + " Must be larger than crop") + .Arg( + "minsize", + "Scale the size of the smallest dimension of the image to" + " this only if the size is initially smaller. Scale and minsize are" + " mutually exclusive. Must be larger than crop.") + .Arg( + "warp", + "If 1, both dimensions of the image will be set to minsize or" + " scale; otherwise, the other dimension is proportionally scaled." + " Defaults to 0") .Arg("crop", "Size to crop the image to. Must be provided") .Arg("mirror", "Whether or not to mirror the image. Defaults to 0") - .Arg("mean", "Mean by which to normalize color channels." - " Defaults to 0.") - .Arg("mean_per_channel", "Vector of means per color channel " - " (1 or 3 elements). Defaults to mean argument. Channel order BGR") - .Arg("std", "Standard deviation by which to normalize color channels." - " Defaults to 1.") - .Arg("std_per_channel", "Vector of standard dev. per color channel " - " (1 or 3 elements). Defaults to std argument. Channel order is BGR") + .Arg( + "mean", + "Mean by which to normalize color channels." + " Defaults to 0.") + .Arg( + "mean_per_channel", + "Vector of means per color channel " + " (1 or 3 elements). Defaults to mean argument. Channel order BGR") + .Arg( + "std", + "Standard deviation by which to normalize color channels." + " Defaults to 1.") + .Arg( + "std_per_channel", + "Vector of standard dev. per color channel " + " (1 or 3 elements). Defaults to std argument. Channel order is BGR") .Arg("bounding_ymin", "Bounding box coordinate. Defaults to -1 (none)") .Arg("bounding_xmin", "Bounding box coordinate. Defaults to -1 (none)") .Arg("bounding_height", "Bounding box coordinate. Defaults to -1 (none)") .Arg("bounding_width", "Bounding box coordinate. Defaults to -1 (none)") .ArgIsTest("Set to 1 to do deterministic cropping. Defaults to 0") .Arg("use_caffe_datum", "1 if the input is in Caffe format. Defaults to 0") - .Arg("use_gpu_transform", "1 if GPU acceleration should be used." - " Defaults to 0. Can only be 1 in a CUDAContext") - .Arg("decode_threads", "Number of CPU decode/transform threads." 
- " Defaults to 4") + .Arg( + "use_gpu_transform", + "1 if GPU acceleration should be used." + " Defaults to 0. Can only be 1 in a CUDAContext") + .Arg( + "decode_threads", + "Number of CPU decode/transform threads." + " Defaults to 4") .Arg("output_type", "If gpu_transform, can set to FLOAT or FLOAT16.") .Arg("db", "Name of the database (if not passed as input)") - .Arg("db_type", "Type of database (if not passed as input)." - " Defaults to leveldb") - .Arg("output_sizes", "The sizes of any outputs besides the data and label " - "(should have a number of elements equal to the number of additional " - "outputs)") - .Arg("random_scale", "[min, max] shortest-side desired for image resize. " - "Defaults to [-1, -1] or no random resize desired.") + .Arg( + "db_type", + "Type of database (if not passed as input)." + " Defaults to leveldb") + .Arg( + "output_sizes", + "The sizes of any outputs besides the data and label " + "(should have a number of elements equal to the number of additional " + "outputs)") + .Arg( + "random_scale", + "[min, max] shortest-side desired for image resize. " + "Defaults to [-1, -1] or no random resize desired.") .Input(0, "reader", "The input reader (a db::DBReader)") .Output(0, "data", "Tensor containing the images") .Output(1, "label", "Tensor containing the labels") - .Output(2, "additional outputs", "Any outputs after the first 2 will be " - "Tensors read from the input TensorProtos"); + .Output( + 2, + "additional outputs", + "Any outputs after the first 2 will be " + "Tensors read from the input TensorProtos"); NO_GRADIENT(ImageInput); #ifdef CAFFE2_USE_MKLDNN -REGISTER_IDEEP_OPERATOR( - ImageInput, - IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR(ImageInput, IDEEPFallbackOp>); #endif -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index bd3fca17bef8..b85091634501 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -4,8 +4,8 @@ #include -#include #include +#include #include "c10/core/thread_pool.h" #include "caffe2/core/common.h" @@ -21,15 +21,15 @@ namespace caffe2 { class CUDAContext; template -class ImageInputOp final - : public PrefetchOperator { +class ImageInputOp final : public PrefetchOperator { // SINGLE_LABEL: single integer label for multi-class classification - // MULTI_LABEL_SPARSE: sparse active label indices for multi-label classification - // MULTI_LABEL_DENSE: dense label embedding vector for label embedding regression - // MULTI_LABEL_WEIGHTED_SPARSE: sparse active label indices with per-label weights - // for multi-label classification - // SINGLE_LABEL_WEIGHTED: single integer label for multi-class classification with weighted sampling - // EMBEDDING_LABEL: an array of floating numbers representing dense embedding. + // MULTI_LABEL_SPARSE: sparse active label indices for multi-label + // classification MULTI_LABEL_DENSE: dense label embedding vector for label + // embedding regression MULTI_LABEL_WEIGHTED_SPARSE: sparse active label + // indices with per-label weights for multi-label classification + // SINGLE_LABEL_WEIGHTED: single integer label for multi-class classification + // with weighted sampling EMBEDDING_LABEL: an array of floating numbers + // representing dense embedding. 
// It is useful for model distillation enum LABEL_TYPE { SINGLE_LABEL = 0, @@ -52,8 +52,7 @@ class ImageInputOp final using OperatorBase::OutputSize; using PrefetchOperator::context_; using PrefetchOperator::prefetch_thread_; - explicit ImageInputOp(const OperatorDef& operator_def, - Workspace* ws); + explicit ImageInputOp(const OperatorDef& operator_def, Workspace* ws); ~ImageInputOp() { PrefetchOperator::Finalize(); } @@ -73,19 +72,26 @@ class ImageInputOp final // Structure to store per-image information // This can be modified by the DecodeAnd* so needs // to be privatized per launch. - using PerImageArg = struct { - BoundingBox bounding_params; - }; + using PerImageArg = struct { BoundingBox bounding_params; }; bool GetImageAndLabelAndInfoFromDBValue( - const string& value, cv::Mat* img, PerImageArg& info, int item_id, + const string& value, + cv::Mat* img, + PerImageArg& info, + int item_id, std::mt19937* randgen); void DecodeAndTransform( - const std::string& value, float *image_data, int item_id, - const int channels, std::size_t thread_index); + const std::string& value, + float* image_data, + int item_id, + const int channels, + std::size_t thread_index); void DecodeAndTransposeOnly( - const std::string& value, uint8_t *image_data, int item_id, - const int channels, std::size_t thread_index); + const std::string& value, + uint8_t* image_data, + int item_id, + const int channels, + std::size_t thread_index); bool ApplyTransformOnGPU( const std::vector& dims, const c10::Device& type); @@ -201,8 +207,8 @@ ImageInputOp::ImageInputOp( 0)), num_decode_threads_( OperatorBase::template GetSingleArgument("decode_threads", 4)), - additional_output_sizes_(OperatorBase::template GetRepeatedArgument( - "output_sizes", {})), + additional_output_sizes_( + OperatorBase::template GetRepeatedArgument("output_sizes", {})), thread_pool_(std::make_shared(num_decode_threads_)), // output type only supported with CUDA and use_gpu_transform for now output_type_( @@ -221,96 +227,102 @@ ImageInputOp::ImageInputOp( } mean_ = OperatorBase::template GetRepeatedArgument( - "mean_per_channel", - {OperatorBase::template GetSingleArgument("mean", 0.)}); + "mean_per_channel", + {OperatorBase::template GetSingleArgument("mean", 0.)}); std_ = OperatorBase::template GetRepeatedArgument( - "std_per_channel", - {OperatorBase::template GetSingleArgument("std", 1.)}); + "std_per_channel", + {OperatorBase::template GetSingleArgument("std", 1.)}); if (additional_output_sizes_.size() == 0) { additional_output_sizes_ = std::vector(OutputSize() - 2, 1); } else { CAFFE_ENFORCE( - additional_output_sizes_.size() == OutputSize() - 2, - "If the output sizes are specified, they must be specified for all " - "additional outputs"); + additional_output_sizes_.size() == OutputSize() - 2, + "If the output sizes are specified, they must be specified for all " + "additional outputs"); } additional_inputs_count_ = OutputSize() - 2; default_arg_.bounding_params = { - false, - OperatorBase::template GetSingleArgument("bounding_ymin", -1), - OperatorBase::template GetSingleArgument("bounding_xmin", -1), - OperatorBase::template GetSingleArgument("bounding_height", -1), - OperatorBase::template GetSingleArgument("bounding_width", -1), + false, + OperatorBase::template GetSingleArgument("bounding_ymin", -1), + OperatorBase::template GetSingleArgument("bounding_xmin", -1), + OperatorBase::template GetSingleArgument("bounding_height", -1), + OperatorBase::template GetSingleArgument("bounding_width", -1), }; if (operator_def.input_size() == 0) { 
LOG(ERROR) << "You are using an old ImageInputOp format that creates " - "a local db reader. Consider moving to the new style " - "that takes in a DBReader blob instead."; - string db_name = - OperatorBase::template GetSingleArgument("db", ""); + "a local db reader. Consider moving to the new style " + "that takes in a DBReader blob instead."; + string db_name = OperatorBase::template GetSingleArgument("db", ""); CAFFE_ENFORCE_GT(db_name.size(), 0, "Must specify a db name."); owned_reader_.reset(new db::DBReader( - OperatorBase::template GetSingleArgument( - "db_type", "leveldb"), + OperatorBase::template GetSingleArgument("db_type", "leveldb"), db_name)); reader_ = owned_reader_.get(); } // hard-coded PCA eigenvectors and eigenvalues, based on RBG channel order color_lighting_eigvecs_.push_back( - std::vector{-144.7125f, 183.396f, 102.2295f}); + std::vector{-144.7125f, 183.396f, 102.2295f}); color_lighting_eigvecs_.push_back( - std::vector{-148.104f, -1.1475f, -207.57f}); + std::vector{-148.104f, -1.1475f, -207.57f}); color_lighting_eigvecs_.push_back( - std::vector{-148.818f, -177.174f, 107.1765f}); + std::vector{-148.818f, -177.174f, 107.1765f}); color_lighting_eigvals_ = std::vector{0.2175f, 0.0188f, 0.0045f}; CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be nonnegative."); if (use_caffe_datum_) { - CAFFE_ENFORCE(label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED, - "Caffe datum only supports single integer label"); + CAFFE_ENFORCE( + label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED, + "Caffe datum only supports single integer label"); } - if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { - CAFFE_ENFORCE_GT(num_labels_, 0, - "Number of labels must be set for using either sparse label indices or dense label embedding."); + if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { + CAFFE_ENFORCE_GT( + num_labels_, + 0, + "Number of labels must be set for using either sparse label indices or dense label embedding."); } if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE || - label_type_ == SINGLE_LABEL_WEIGHTED) { + label_type_ == SINGLE_LABEL_WEIGHTED) { additional_inputs_offset_ = 3; } else { additional_inputs_offset_ = 2; } - CAFFE_ENFORCE((scale_ > 0) != (minsize_ > 0), - "Must provide one and only one of scaling or minsize"); + CAFFE_ENFORCE( + (scale_ > 0) != (minsize_ > 0), + "Must provide one and only one of scaling or minsize"); CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value."); CAFFE_ENFORCE_GE( - scale_ > 0 ? scale_ : minsize_, - crop_, "The scale/minsize value must be no smaller than the crop value."); + scale_ > 0 ? scale_ : minsize_, + crop_, + "The scale/minsize value must be no smaller than the crop value."); CAFFE_ENFORCE_EQ( mean_.size(), std_.size(), "The mean and std. dev vectors must be of the same size."); - CAFFE_ENFORCE(mean_.size() == 1 || mean_.size() == 3, - "The mean and std. dev vectors must be of size 1 or 3"); + CAFFE_ENFORCE( + mean_.size() == 1 || mean_.size() == 3, + "The mean and std. 
dev vectors must be of size 1 or 3"); CAFFE_ENFORCE( !use_caffe_datum_ || OutputSize() == 2, "There can only be 2 outputs if the Caffe datum format is used"); - CAFFE_ENFORCE(random_scale_.size() == 2, - "Must provide [scale_min, scale_max]"); - CAFFE_ENFORCE_GE(random_scale_[1], random_scale_[0], + CAFFE_ENFORCE( + random_scale_.size() == 2, "Must provide [scale_min, scale_max]"); + CAFFE_ENFORCE_GE( + random_scale_[1], + random_scale_[0], "random scale must provide a range [min, max]"); - if (default_arg_.bounding_params.ymin < 0 - || default_arg_.bounding_params.xmin < 0 - || default_arg_.bounding_params.height < 0 - || default_arg_.bounding_params.width < 0) { + if (default_arg_.bounding_params.ymin < 0 || + default_arg_.bounding_params.xmin < 0 || + default_arg_.bounding_params.height < 0 || + default_arg_.bounding_params.width < 0) { default_arg_.bounding_params.valid = false; } else { default_arg_.bounding_params.valid = true; @@ -334,11 +346,10 @@ ImageInputOp::ImageInputOp( LOG(INFO) << " Applying a default bounding box of Y [" << default_arg_.bounding_params.ymin << "; " << default_arg_.bounding_params.ymin + - default_arg_.bounding_params.height - << ") x X [" - << default_arg_.bounding_params.xmin << "; " + default_arg_.bounding_params.height + << ") x X [" << default_arg_.bounding_params.xmin << "; " << default_arg_.bounding_params.xmin + - default_arg_.bounding_params.width + default_arg_.bounding_params.width << ")"; } if (scale_ > 0 && !random_scaling_) { @@ -348,8 +359,7 @@ ImageInputOp::ImageInputOp( if (random_scaling_) { // randomly set min_size_ for each image LOG(INFO) << " Randomly scaling shortest side between " - << random_scale_[0] << " and " - << random_scale_[1]; + << random_scale_[0] << " and " << random_scale_[1]; } else { // Here, minsize_ > 0 LOG(INFO) << " Ensuring minimum image size of " << minsize_ @@ -365,16 +375,16 @@ ImageInputOp::ImageInputOp( auto mit = mean_.begin(); auto sit = std_.begin(); - for (int i = 0; - mit != mean_.end() && sit != std_.end(); - ++mit, ++sit, ++i) { + for (int i = 0; mit != mean_.end() && sit != std_.end(); ++mit, ++sit, ++i) { LOG(INFO) << " Default [Channel " << i << "] Subtract mean " << *mit << " and divide by std " << *sit << "."; // We actually will use the inverse of std, so inverse it here *sit = 1.f / *sit; } LOG(INFO) << " Outputting images as " - << OperatorBase::template GetSingleArgument("output_type", "unknown") << "."; + << OperatorBase::template GetSingleArgument( + "output_type", "unknown") + << "."; std::mt19937 meta_randgen(time(nullptr)); for (int i = 0; i < num_decode_threads_; ++i) { @@ -394,25 +404,17 @@ ImageInputOp::ImageInputOp( sizes = std::vector{batch_size_}; } // data type for prefetched_label_ is actually not known here.. 
- ReinitializeTensor( - &prefetched_label_, - sizes, - at::dtype().device(CPU)); + ReinitializeTensor(&prefetched_label_, sizes, at::dtype().device(CPU)); for (int i = 0; i < additional_output_sizes_.size(); ++i) { prefetched_additional_outputs_on_device_.emplace_back(); prefetched_additional_outputs_.emplace_back(); } - } // Inception-stype scale jittering template -bool RandomSizedCropping( - cv::Mat* img, - const int crop, - std::mt19937* randgen -) { +bool RandomSizedCropping(cv::Mat* img, const int crop, std::mt19937* randgen) { cv::Mat scaled_img; bool inception_scale_jitter = false; int im_height = img->rows, im_width = img->cols; @@ -426,20 +428,15 @@ bool RandomSizedCropping( float aspect_ratio = aspect_ratio_dis(*randgen); int nh = floor(std::sqrt(((float)target_area / aspect_ratio))); int nw = floor(std::sqrt(((float)target_area * aspect_ratio))); - if (nh >= 1 && nh <= im_height && nw >=1 && nw <= im_width) { - int height_offset = std::uniform_int_distribution<>( - 0, im_height - nh)(*randgen); - int width_offset = std::uniform_int_distribution<>( - 0,im_width - nw)(*randgen); + if (nh >= 1 && nh <= im_height && nw >= 1 && nw <= im_width) { + int height_offset = + std::uniform_int_distribution<>(0, im_height - nh)(*randgen); + int width_offset = + std::uniform_int_distribution<>(0, im_width - nw)(*randgen); cv::Rect ROI(width_offset, height_offset, nw, nh); cropping = (*img)(ROI); cv::resize( - cropping, - scaled_img, - cv::Size(crop, crop), - 0, - 0, - cv::INTER_AREA); + cropping, scaled_img, cv::Size(crop, crop), 0, 0, cv::INTER_AREA); *img = scaled_img; inception_scale_jitter = true; break; @@ -697,7 +694,8 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( if (out_c == src.channels()) { *img = src; } else { - cv::cvtColor(src, *img, (out_c == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR); + cv::cvtColor( + src, *img, (out_c == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR); } // Note(Yangqing): I believe that the mat should be created continuous. 
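Editor's note on the hunk above: RandomSizedCropping implements Inception-style scale jittering by repeatedly sampling a target area fraction and aspect ratio until a feasible crop is found. Below is a minimal standalone sketch of just that sampling step; the 0.08-1.0 area-fraction and 3/4-4/3 aspect-ratio ranges are the conventional Inception choices and are assumptions here (the operator's actual distributions are defined in code not shown in this hunk), and the OpenCV crop/resize is omitted.

// Standalone sketch of the Inception-style crop sampling used by
// RandomSizedCropping above. Ranges are assumed, not taken from this diff.
#include <cmath>
#include <cstdio>
#include <random>

struct CropRect {
  int x, y, w, h;
  bool found;
};

CropRect SampleInceptionCrop(int im_width, int im_height, std::mt19937* rng) {
  std::uniform_real_distribution<float> area_frac(0.08f, 1.0f);
  std::uniform_real_distribution<float> aspect(0.75f, 4.0f / 3.0f);
  for (int attempt = 0; attempt < 10; ++attempt) {
    const float target_area = area_frac(*rng) * im_width * im_height;
    const float ar = aspect(*rng);
    // Same convention as above: height ~ sqrt(area / ar), width ~ sqrt(area * ar).
    const int nh = static_cast<int>(std::floor(std::sqrt(target_area / ar)));
    const int nw = static_cast<int>(std::floor(std::sqrt(target_area * ar)));
    if (nh >= 1 && nh <= im_height && nw >= 1 && nw <= im_width) {
      const int y = std::uniform_int_distribution<>(0, im_height - nh)(*rng);
      const int x = std::uniform_int_distribution<>(0, im_width - nw)(*rng);
      return {x, y, nw, nh, true};  // caller would crop this rect, then resize to crop x crop
    }
  }
  return {0, 0, im_width, im_height, false};  // no feasible crop; fall back to simple cropping
}

int main() {
  std::mt19937 rng(42);
  const CropRect r = SampleInceptionCrop(640, 480, &rng);
  std::printf("crop x=%d y=%d w=%d h=%d found=%d\n", r.x, r.y, r.w, r.h, r.found);
  return 0;
}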
@@ -706,23 +704,26 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( // Sanity check now that we decoded everything // Ensure that the bounding box is legit - if (info.bounding_params.valid - && (src.rows < info.bounding_params.ymin + info.bounding_params.height - || src.cols < info.bounding_params.xmin + info.bounding_params.width - )) { + if (info.bounding_params.valid && + (src.rows < info.bounding_params.ymin + info.bounding_params.height || + src.cols < info.bounding_params.xmin + info.bounding_params.width)) { info.bounding_params.valid = false; } // Apply the bounding box if requested if (info.bounding_params.valid) { // If we reach here, we know the parameters are sane - cv::Rect bounding_box(info.bounding_params.xmin, info.bounding_params.ymin, - info.bounding_params.width, info.bounding_params.height); + cv::Rect bounding_box( + info.bounding_params.xmin, + info.bounding_params.ymin, + info.bounding_params.width, + info.bounding_params.height); *img = (*img)(bounding_box); /* LOG(INFO) << "Did bounding with ymin:" - << info.bounding_params.ymin << " xmin:" << info.bounding_params.xmin + << info.bounding_params.ymin << " xmin:" << + info.bounding_params.xmin << " height:" << info.bounding_params.height << " width:" << info.bounding_params.width << "\n"; LOG(INFO) << "Bounded matrix: " << img; @@ -736,52 +737,51 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( if (scale_jitter_type_ == INCEPTION_STYLE) { if (!is_test_) { // Inception-stype scale jittering is only used for training - inception_scale_jitter = RandomSizedCropping(img, crop_, randgen); + inception_scale_jitter = + RandomSizedCropping(img, crop_, randgen); // if a random crop is still not found, do simple random cropping later } } if ((scale_jitter_type_ == NO_SCALE_JITTER) || - (scale_jitter_type_ == INCEPTION_STYLE && !inception_scale_jitter)) { - int scaled_width, scaled_height; - int scale_to_use = scale_ > 0 ? scale_ : minsize_; - - // set the random minsize - if (random_scaling_) { - scale_to_use = std::uniform_int_distribution<>(random_scale_[0], - random_scale_[1])(*randgen); - } + (scale_jitter_type_ == INCEPTION_STYLE && !inception_scale_jitter)) { + int scaled_width, scaled_height; + int scale_to_use = scale_ > 0 ? 
scale_ : minsize_; - if (warp_) { - scaled_width = scale_to_use; - scaled_height = scale_to_use; - } else if (img->rows > img->cols) { - scaled_width = scale_to_use; - scaled_height = - static_cast(img->rows) * scale_to_use / img->cols; - } else { - scaled_height = scale_to_use; - scaled_width = - static_cast(img->cols) * scale_to_use / img->rows; - } - if ((scale_ > 0 && - (scaled_height != img->rows || scaled_width != img->cols)) - || (scaled_height > img->rows || scaled_width > img->cols)) { - // We rescale in all cases if we are using scale_ - // but only to make the image bigger if using minsize_ - /* - LOG(INFO) << "Scaling to " << scaled_width << " x " << scaled_height - << " From " << img->cols << " x " << img->rows; - */ - cv::resize( - *img, - scaled_img, - cv::Size(scaled_width, scaled_height), - 0, - 0, - cv::INTER_AREA); - *img = scaled_img; - } + // set the random minsize + if (random_scaling_) { + scale_to_use = std::uniform_int_distribution<>( + random_scale_[0], random_scale_[1])(*randgen); + } + + if (warp_) { + scaled_width = scale_to_use; + scaled_height = scale_to_use; + } else if (img->rows > img->cols) { + scaled_width = scale_to_use; + scaled_height = static_cast(img->rows) * scale_to_use / img->cols; + } else { + scaled_height = scale_to_use; + scaled_width = static_cast(img->cols) * scale_to_use / img->rows; + } + if ((scale_ > 0 && + (scaled_height != img->rows || scaled_width != img->cols)) || + (scaled_height > img->rows || scaled_width > img->cols)) { + // We rescale in all cases if we are using scale_ + // but only to make the image bigger if using minsize_ + /* + LOG(INFO) << "Scaling to " << scaled_width << " x " << scaled_height + << " From " << img->cols << " x " << img->rows; + */ + cv::resize( + *img, + scaled_img, + cv::Size(scaled_width, scaled_height), + 0, + 0, + cv::INTER_AREA); + *img = scaled_img; + } } // TODO(Yangqing): return false if any error happens. 
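The resize rule in the hunk above (always resize to match scale_, but only upscale when minsize_ is used) can be restated as a small pure function. This is only an illustrative restatement of the logic shown above, not part of the patch; names are invented for the sketch.

// Illustrative restatement of the scale_/minsize_ resize decision above.
#include <cstdio>

struct ResizePlan {
  int width, height;
  bool do_resize;
};

ResizePlan PlanResize(int cols, int rows, int scale, int minsize, bool warp) {
  // Exactly one of scale/minsize is expected to be set (> 0), as enforced in
  // the operator constructor earlier in this file.
  const bool use_scale = scale > 0;
  const int scale_to_use = use_scale ? scale : minsize;
  ResizePlan p{};
  if (warp) {
    // Ignore aspect ratio: both sides become scale_to_use.
    p.width = scale_to_use;
    p.height = scale_to_use;
  } else if (rows > cols) {
    // Shorter side (cols) becomes scale_to_use; the other side keeps the ratio.
    p.width = scale_to_use;
    p.height = static_cast<int>(static_cast<long long>(rows) * scale_to_use / cols);
  } else {
    p.height = scale_to_use;
    p.width = static_cast<int>(static_cast<long long>(cols) * scale_to_use / rows);
  }
  // With scale_: resize whenever the size changes at all.
  // With minsize_: resize only if it makes the image bigger.
  p.do_resize = (use_scale && (p.height != rows || p.width != cols)) ||
      (p.height > rows || p.width > cols);
  return p;
}

int main() {
  const ResizePlan a = PlanResize(800, 600, /*scale=*/256, /*minsize=*/0, /*warp=*/false);
  const ResizePlan b = PlanResize(800, 600, /*scale=*/0, /*minsize=*/256, /*warp=*/false);
  std::printf("scale:   %dx%d resize=%d\n", a.width, a.height, a.do_resize);
  std::printf("minsize: %dx%d resize=%d\n", b.width, b.height, b.do_resize);
  return 0;
}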
@@ -791,19 +791,18 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( // assume HWC order and color channels BGR template void Saturation( - float* img, - const int img_size, - const float alpha_rand, - std::mt19937* randgen -) { + float* img, + const int img_size, + const float alpha_rand, + std::mt19937* randgen) { float alpha = 1.0f + - std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { float gray_color = img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + - img[3 * p + 2] * 0.299f; + img[3 * p + 2] * 0.299f; for (int c = 0; c < 3; ++c) { img[3 * p + c] = img[3 * p + c] * alpha + gray_color * (1.0f - alpha); } @@ -815,13 +814,12 @@ void Saturation( // assume HWC order and color channels BGR template void Brightness( - float* img, - const int img_size, - const float alpha_rand, - std::mt19937* randgen -) { + float* img, + const int img_size, + const float alpha_rand, + std::mt19937* randgen) { float alpha = 1.0f + - std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { @@ -835,25 +833,24 @@ void Brightness( // assume HWC order and color channels BGR template void Contrast( - float* img, - const int img_size, - const float alpha_rand, - std::mt19937* randgen -){ + float* img, + const int img_size, + const float alpha_rand, + std::mt19937* randgen) { float gray_mean = 0; int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 gray_mean += img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + - img[3 * p + 2] * 0.299f; + img[3 * p + 2] * 0.299f; p++; } } gray_mean /= (img_size * img_size); float alpha = 1.0f + - std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { @@ -868,19 +865,20 @@ void Contrast( // assume HWC order and color channels BGR template void ColorJitter( - float* img, - const int img_size, - const float saturation, - const float brightness, - const float contrast, - std::mt19937* randgen -) { - std::srand (unsigned(std::time(0))); + float* img, + const int img_size, + const float saturation, + const float brightness, + const float contrast, + std::mt19937* randgen) { + std::srand(unsigned(std::time(0))); std::vector jitter_order{0, 1, 2}; // obtain a time-based seed: unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::shuffle(jitter_order.begin(), jitter_order.end(), - std::default_random_engine(seed)); + std::shuffle( + jitter_order.begin(), + jitter_order.end(), + std::default_random_engine(seed)); for (int i = 0; i < 3; ++i) { if (jitter_order[i] == 0) { @@ -896,13 +894,12 @@ void ColorJitter( // assume HWC order and color channels BGR template void ColorLighting( - float* img, - const int img_size, - const float alpha_std, - const std::vector>& eigvecs, - const std::vector& eigvals, - std::mt19937* randgen -) { + float* img, + const int img_size, + const float alpha_std, + const std::vector>& eigvecs, + const std::vector& eigvals, + std::mt19937* randgen) { std::normal_distribution d(0, alpha_std); 
std::vector alphas(3); for (int i = 0; i < 3; ++i) { @@ -924,19 +921,17 @@ void ColorLighting( } } } - } // assume HWC order and color channels BGR // mean subtraction and scaling. template void ColorNormalization( - float* img, - const int img_size, - const int channels, - const std::vector& mean, - const std::vector& std -) { + float* img, + const int img_size, + const int channels, + const std::vector& mean, + const std::vector& std) { int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { @@ -981,9 +976,9 @@ void TransformImage( height_offset = (scaled_img.rows - crop) / 2; } else { width_offset = - std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); height_offset = - std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); } float* image_data_ptr = image_data; @@ -1010,12 +1005,17 @@ void TransformImage( } if (color_jitter && channels == 3 && !is_test) { - ColorJitter(image_data, crop, saturation, brightness, contrast, - randgen); + ColorJitter( + image_data, crop, saturation, brightness, contrast, randgen); } if (color_lighting && channels == 3 && !is_test) { - ColorLighting(image_data, crop, color_lighting_std, - color_lighting_eigvecs, color_lighting_eigvals, randgen); + ColorLighting( + image_data, + crop, + color_lighting_std, + color_lighting_eigvecs, + color_lighting_eigvals, + randgen); } // Color normalization @@ -1026,11 +1026,15 @@ void TransformImage( // Only crop / transose the image // leave in uint8_t dataType template -void CropTransposeImage(const cv::Mat& scaled_img, const int channels, - uint8_t *cropped_data, const int crop, - const bool mirror, std::mt19937 *randgen, - std::bernoulli_distribution *mirror_this_image, - bool is_test = false) { +void CropTransposeImage( + const cv::Mat& scaled_img, + const int channels, + uint8_t* cropped_data, + const int crop, + const bool mirror, + std::mt19937* randgen, + std::bernoulli_distribution* mirror_this_image, + bool is_test = false) { CAFFE_ENFORCE_GE( scaled_img.rows, crop, "Image height must be bigger than crop."); CAFFE_ENFORCE_GE( @@ -1043,16 +1047,16 @@ void CropTransposeImage(const cv::Mat& scaled_img, const int channels, height_offset = (scaled_img.rows - crop) / 2; } else { width_offset = - std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); height_offset = - std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); } if (mirror && (*mirror_this_image)(*randgen)) { // Copy mirrored image. for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset + crop - 1; w >= width_offset; --w) { - const uint8_t* cv_data = scaled_img.ptr(h) + w*channels; + const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; for (int c = 0; c < channels; ++c) { *(cropped_data++) = cv_data[c]; } @@ -1062,7 +1066,7 @@ void CropTransposeImage(const cv::Mat& scaled_img, const int channels, // Copy normally. 
for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset; w < width_offset + crop; ++w) { - const uint8_t* cv_data = scaled_img.ptr(h) + w*channels; + const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; for (int c = 0; c < channels; ++c) { *(cropped_data++) = cv_data[c]; } @@ -1075,9 +1079,11 @@ void CropTransposeImage(const cv::Mat& scaled_img, const int channels, // Intended as entry point for binding to thread pool template void ImageInputOp::DecodeAndTransform( - const std::string& value, float *image_data, int item_id, - const int channels, std::size_t thread_index) { - + const std::string& value, + float* image_data, + int item_id, + const int channels, + std::size_t thread_index) { CAFFE_ENFORCE((int)thread_index < num_decode_threads_); std::bernoulli_distribution mirror_this_image(0.5f); @@ -1089,18 +1095,34 @@ void ImageInputOp::DecodeAndTransform( CHECK( GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen)); // Factor out the image transformation - TransformImage(img, channels, image_data, - color_jitter_, img_saturation_, img_brightness_, img_contrast_, - color_lighting_, color_lighting_std_, color_lighting_eigvecs_, - color_lighting_eigvals_, crop_, mirror_, mean_, std_, - randgen, &mirror_this_image, is_test_); + TransformImage( + img, + channels, + image_data, + color_jitter_, + img_saturation_, + img_brightness_, + img_contrast_, + color_lighting_, + color_lighting_std_, + color_lighting_eigvecs_, + color_lighting_eigvals_, + crop_, + mirror_, + mean_, + std_, + randgen, + &mirror_this_image, + is_test_); } template void ImageInputOp::DecodeAndTransposeOnly( - const std::string& value, uint8_t *image_data, int item_id, - const int channels, std::size_t thread_index) { - + const std::string& value, + uint8_t* image_data, + int item_id, + const int channels, + std::size_t thread_index) { CAFFE_ENFORCE((int)thread_index < num_decode_threads_); std::bernoulli_distribution mirror_this_image(0.5f); @@ -1113,11 +1135,17 @@ void ImageInputOp::DecodeAndTransposeOnly( GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen)); // Factor out the image transformation - CropTransposeImage(img, channels, image_data, crop_, mirror_, - randgen, &mirror_this_image, is_test_); + CropTransposeImage( + img, + channels, + image_data, + crop_, + mirror_, + randgen, + &mirror_this_image, + is_test_); } - template bool ImageInputOp::Prefetch() { if (!owned_reader_.get()) { @@ -1146,16 +1174,16 @@ bool ImageInputOp::Prefetch() { reader_->Read(&key, &value); // determine label type based on first item - if( item_id == 0 ) { - if( use_caffe_datum_ ) { + if (item_id == 0) { + if (use_caffe_datum_) { prefetched_label_.mutable_data(); } else { TensorProtos protos; CAFFE_ENFORCE(protos.ParseFromString(value)); TensorProto_DataType labeldt = protos.protos(1).data_type(); - if( labeldt == TensorProto::INT32 ) { + if (labeldt == TensorProto::INT32) { prefetched_label_.mutable_data(); - } else if ( labeldt == TensorProto::FLOAT) { + } else if (labeldt == TensorProto::FLOAT) { prefetched_label_.mutable_data(); } else { LOG(FATAL) << "Unsupported label type."; @@ -1164,7 +1192,8 @@ bool ImageInputOp::Prefetch() { for (int i = 0; i < additional_inputs_count_; ++i) { int index = additional_inputs_offset_ + i; TensorProto additional_output_proto = protos.protos(index); - auto sizes = std::vector({batch_size_, additional_output_sizes_[i]}); + auto sizes = + std::vector({batch_size_, additional_output_sizes_[i]}); if 
(additional_output_proto.data_type() == TensorProto::FLOAT) { prefetched_additional_outputs_[i] = caffe2::empty(sizes, at::dtype().device(CPU)); @@ -1312,6 +1341,6 @@ bool ImageInputOp::CopyPrefetched() { } return true; } -} // namespace caffe2 +} // namespace caffe2 -#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ +#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ diff --git a/caffe2/image/image_input_op_gpu.cc b/caffe2/image/image_input_op_gpu.cc index 56d2f3dd317b..a484585770e0 100644 --- a/caffe2/image/image_input_op_gpu.cc +++ b/caffe2/image/image_input_op_gpu.cc @@ -35,4 +35,4 @@ bool ImageInputOp::ApplyTransformOnGPU( REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index f556e9c7956c..0131c9179d7d 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -2072,7 +2072,9 @@ bool RunOnDevice() override { OperatorBase::GetSingleArgument("post_nms_topN", 300)), rpn_nms_thresh_( OperatorBase::GetSingleArgument("nms_thresh", 0.7f)), - rpn_min_size_(OperatorBase::GetSingleArgument("min_size", 16)) {} + rpn_min_size_(OperatorBase::GetSingleArgument("min_size", 16)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) {} template std::vector nms_metal( @@ -2207,14 +2209,21 @@ void ProposalsForOneImage( Eigen::Map(scores.data(), H * W, A) = Eigen::Map(scores_tensor.data(), A, H * W).transpose(); // Transform anchors into proposals via bbox transformations - auto proposals = utils::bbox_transform(all_anchors.array(), bbox_deltas); + auto proposals = utils::bbox_transform( + all_anchors.array(), + bbox_deltas, + std::vector{1.0, 1.0, 1.0, 1.0}, + utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_); // 2. clip proposals to image (may result in proposals with zero area // that will be removed in the next step) - proposals = utils::clip_boxes(proposals, im_info[0], im_info[1]); + proposals = utils::clip_boxes( + proposals, im_info[0], im_info[1], 1.0, legacy_plus_one_); // 3. remove predicted boxes with either height or width < min_size - auto keep = utils::filter_boxes(proposals, min_size, im_info); + auto keep = + utils::filter_boxes(proposals, min_size, im_info, legacy_plus_one_); DCHECK_LE(keep.size(), scores.size()); @@ -2334,6 +2343,8 @@ bool RunOnDevice() override { float rpn_nms_thresh_{0.7}; // RPN_MIN_SIZE float rpn_min_size_{16}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; // threads per thread group, used in nms ushort maxThreadsPerThreadgroup{32}; diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index 62ad5ad57121..aeefc131f002 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -129,7 +129,7 @@ from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being two dimensional, it behaves like normal matrix multiplication. )DOC") .Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)") - .Input(1, "B", "tensor of shpae (dim0, dim2 ... K, N)") + .Input(1, "B", "tensor of shape (dim0, dim1 ... K, N)") .Output(0, "Y", "tensor of shape (dim0, dim1 ...
M, N)") .Arg( "trans_a", diff --git a/caffe2/operators/bbox_transform_op.cc b/caffe2/operators/bbox_transform_op.cc index f8c795c5dba0..e09d795320c9 100644 --- a/caffe2/operators/bbox_transform_op.cc +++ b/caffe2/operators/bbox_transform_op.cc @@ -154,11 +154,12 @@ bool BBoxTransformOp::RunOnDevice() { cur_deltas, weights_, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, angle_bound_on_, angle_bound_lo_, angle_bound_hi_); - EArrXXf clip_boxes = - utils::clip_boxes(trans_boxes, img_h, img_w, clip_angle_thresh_); + EArrXXf clip_boxes = utils::clip_boxes( + trans_boxes, img_h, img_w, clip_angle_thresh_, legacy_plus_one_); // Do not apply scale for angle in rotated boxes clip_boxes.leftCols(4) *= scale_after; new_boxes.block(offset, k * box_dim, num_rois, box_dim) = clip_boxes; @@ -184,6 +185,7 @@ bool BBoxTransformOp::RunOnDevice() { using BBoxTransformOpFloatCPU = caffe2::BBoxTransformOp; +// clang-format off C10_REGISTER_CAFFE2_OPERATOR_CPU( BBoxTransform, "_caffe2::BBoxTransform(" @@ -196,9 +198,11 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( "bool angle_bound_on, " "int angle_bound_lo, " "int angle_bound_hi, " - "float clip_angle_thresh" + "float clip_angle_thresh, " + "bool legacy_plus_one" ") -> (" "Tensor output_0, " "Tensor output_1" ")", BBoxTransformOpFloatCPU); +// clang-format on diff --git a/caffe2/operators/bbox_transform_op.h b/caffe2/operators/bbox_transform_op.h index e2bcf9b8c05d..57eef7533206 100644 --- a/caffe2/operators/bbox_transform_op.h +++ b/caffe2/operators/bbox_transform_op.h @@ -3,11 +3,11 @@ #ifndef BBOX_TRANSFORM_OP_H_ #define BBOX_TRANSFORM_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" -#include "caffe2/core/c10_operator.h" C10_DECLARE_CAFFE2_OPERATOR(BBoxTransform) @@ -16,7 +16,7 @@ namespace caffe2 { template class BBoxTransformOp final : public Operator { public: - template + template explicit BBoxTransformOp(Args&&... args) : Operator(std::forward(args)...), weights_(this->template GetRepeatedArgument( @@ -32,7 +32,9 @@ class BBoxTransformOp final : public Operator { angle_bound_hi_( this->template GetSingleArgument("angle_bound_hi", 90)), clip_angle_thresh_( - this->template GetSingleArgument("clip_angle_thresh", 1.0)) { + this->template GetSingleArgument("clip_angle_thresh", 1.0)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { CAFFE_ENFORCE_EQ( weights_.size(), 4, @@ -62,6 +64,8 @@ class BBoxTransformOp final : public Operator { // tolerance for backward compatibility. Set to negative value for // no clipping. float clip_angle_thresh_{1.0}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; }; } // namespace caffe2 diff --git a/caffe2/operators/box_with_nms_limit_op.cc b/caffe2/operators/box_with_nms_limit_op.cc index b780bc2994c4..c1890de4c60c 100644 --- a/caffe2/operators/box_with_nms_limit_op.cc +++ b/caffe2/operators/box_with_nms_limit_op.cc @@ -98,7 +98,9 @@ bool BoxWithNMSLimitOp::RunOnDevice() { soft_nms_sigma_, nms_thres_, soft_nms_min_score_thres_, - soft_nms_method_); + soft_nms_method_, + -1, /* topN */ + legacy_plus_one_); } else { std::sort( inds.data(), @@ -107,8 +109,13 @@ bool BoxWithNMSLimitOp::RunOnDevice() { return cur_scores(lhs) > cur_scores(rhs); }); int keep_max = detections_per_im_ > 0 ? 
detections_per_im_ : -1; - keeps[j] = - utils::nms_cpu(cur_boxes, cur_scores, inds, nms_thres_, keep_max); + keeps[j] = utils::nms_cpu( + cur_boxes, + cur_scores, + inds, + nms_thres_, + keep_max, + legacy_plus_one_); } total_keep_count += keeps[j].size(); } @@ -300,6 +307,7 @@ SHOULD_NOT_DO_GRADIENT(BoxWithNMSLimit); } // namespace } // namespace caffe2 +// clang-format off C10_REGISTER_CAFFE2_OPERATOR_CPU( BoxWithNMSLimit, "_caffe2::BoxWithNMSLimit(" @@ -316,7 +324,8 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( "bool rotated, " "bool cls_agnostic_bbox_reg, " "bool input_boxes_include_bg_cls, " - "bool output_classes_include_bg_cls " + "bool output_classes_include_bg_cls, " + "bool legacy_plus_one " ") -> (" "Tensor scores, " "Tensor boxes, " @@ -326,3 +335,4 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( //"Tensor keeps_size, " ")", caffe2::BoxWithNMSLimitOp); +// clang-format on diff --git a/caffe2/operators/box_with_nms_limit_op.h b/caffe2/operators/box_with_nms_limit_op.h index 090993fcef25..d0c7c6a37a3c 100644 --- a/caffe2/operators/box_with_nms_limit_op.h +++ b/caffe2/operators/box_with_nms_limit_op.h @@ -3,10 +3,9 @@ #ifndef BOX_WITH_NMS_AND_LIMIT_OP_H_ #define BOX_WITH_NMS_AND_LIMIT_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/operator.h" -#include "caffe2/core/c10_operator.h" - C10_DECLARE_CAFFE2_OPERATOR(BoxWithNMSLimit) @@ -44,7 +43,9 @@ class BoxWithNMSLimitOp final : public Operator { true)), output_classes_include_bg_cls_(this->template GetSingleArgument( "output_classes_include_bg_cls", - true)) { + true)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { CAFFE_ENFORCE( soft_nms_method_str_ == "linear" || soft_nms_method_str_ == "gaussian", "Unexpected soft_nms_method"); @@ -91,6 +92,8 @@ class BoxWithNMSLimitOp final : public Operator { // The index where foreground starts in scoures. Eg. if 0 represents // background class then foreground class starts with 1. int input_scores_fg_cls_starting_id_{1}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; // Map a class id (starting with background and then foreground) from (0, 1, // ..., NUM_FG_CLASSES) to it's matching value in box diff --git a/caffe2/operators/bucketize_op.cc b/caffe2/operators/bucketize_op.cc new file mode 100644 index 000000000000..a7e9229e70ff --- /dev/null +++ b/caffe2/operators/bucketize_op.cc @@ -0,0 +1,64 @@ +#include "caffe2/operators/bucketize_op.h" + +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" + +namespace caffe2 { + +template +bool BucketizeOp::RunOnDevice() { + auto& input = Input(X); + CAFFE_ENFORCE_GE(input.dim(), 1); + + auto N = input.numel(); + auto* output = Output(INDICES, input.sizes(), at::dtype()); + const auto* input_data = input.template data(); + auto* output_data = output->template mutable_data(); + + math::Set(output->numel(), 0.0, output_data, &context_); + + for (int64_t pos = 0; pos < N; pos++) { + // here we assume the boundary values for each feature are sorted + int64_t bucket_idx = + std::lower_bound( + boundaries_.begin(), boundaries_.end(), input_data[pos]) - + boundaries_.begin(); + output_data[pos] = bucket_idx; + } + + return true; +}; +REGISTER_CPU_OPERATOR(Bucketize, BucketizeOp); + +OPERATOR_SCHEMA(Bucketize) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +This operator works as bucketize in tensorflow and digitize +in numpy. It bucketizes the input 'X' based on argument 'boundaries'. 
+For each value x in input 'data', the operator returns index i given +boundaries[i-1] < x <= boundaries[i]. +If values in 'data' are beyond the bounds of boundaries, 0 or +len(boundaries) is returned as appropriate. +The boundaries need to be monotonically increasing. +For example: + +If data = [2, 4, 1] and boundaries = [0.1, 2.5], then + +output = [1, 2, 1] + +If data = [[2, 3], [4, 1], [2, 5]] and boundaries = [0.1, 2.5], then + +output = [[1, 2], [2, 1], [1, 2]] + +)DOC") + .Input(0, "data", "input tensor") + .Output( + 0, + "output", + "indices of bins given by boundaries to which each value " + "in data belongs") + .Arg("boundaries", "bucketization boundaries"); + +NO_GRADIENT(BucketizeOp); +} // namespace caffe2 diff --git a/caffe2/operators/bucketize_op.cu b/caffe2/operators/bucketize_op.cu new file mode 100644 index 000000000000..1864c08fa637 --- /dev/null +++ b/caffe2/operators/bucketize_op.cu @@ -0,0 +1,7 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/bucketize_op.h" +#include "caffe2/operators/operator_fallback_gpu.h" + +namespace caffe2 { +REGISTER_CUDA_OPERATOR(Bucketize, GPUFallbackOp); +} // namespace caffe2 diff --git a/caffe2/operators/bucketize_op.h b/caffe2/operators/bucketize_op.h new file mode 100644 index 000000000000..7e536b701a61 --- /dev/null +++ b/caffe2/operators/bucketize_op.h @@ -0,0 +1,37 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#ifndef CAFFE2_OPERATORS_BUCKETIZE_OP_H_ +#define CAFFE2_OPERATORS_BUCKETIZE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class BucketizeOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + BucketizeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + boundaries_(this->template GetRepeatedArgument("boundaries")) { + CAFFE_ENFORCE( + std::is_sorted(boundaries_.begin(), boundaries_.end()), + "The boundaries need to be monotonically increasing"); + } + + bool RunOnDevice() override; + + protected: + INPUT_TAGS(X); + OUTPUT_TAGS(INDICES); + + private: + std::vector boundaries_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BUCKETIZE_OP_H_ diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc index 3e15a9e0ee82..53390ac6ed51 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc @@ -5,14 +5,14 @@ namespace caffe2 { namespace utils { // Compute the area of an array of boxes. -ERArrXXf BoxesArea(const ERArrXXf& boxes) { +ERArrXXf BoxesArea(const ERArrXXf& boxes, const bool legacy_plus_one) { // equivalent to python code // w = (boxes[:, 2] - boxes[:, 0] + 1) // h = (boxes[:, 3] - boxes[:, 1] + 1) // areas = w * h // assert np.all(areas >= 0), 'Negative areas founds' - const auto w = boxes.col(2) - boxes.col(0) + 1; - const auto h = boxes.col(3) - boxes.col(1) + 1; + const auto w = boxes.col(2) - boxes.col(0) + int(legacy_plus_one); + const auto h = boxes.col(3) - boxes.col(1) + int(legacy_plus_one); const ERArrXXf areas = w * h; CAFFE_ENFORCE((areas >= 0).all(), "Negative areas found: ", boxes); return areas; @@ -20,11 +20,15 @@ ERArrXXf BoxesArea(const ERArrXXf& boxes) { // Determine which FPN level each RoI in a set of RoIs should map to based // on the heuristic in the FPN paper.
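To make legacy_plus_one and the FPN mapping concrete: the sketch below computes a box area with and without the legacy "+ 1" and then applies the FPN-paper heuristic that the level-mapping helper below is described as implementing. It is illustrative only; the canonical scale/level (224, 4) and the level range (2..5) are the defaults declared elsewhere in this patch, while the 1e-6 epsilon and the clamping follow the Detectron reference and are assumptions here.

// Illustrative only: box geometry with/without the legacy "+ 1", plus the
// FPN-paper level heuristic. Epsilon and clamping are assumed from the
// Detectron reference, not taken from this diff.
#include <algorithm>
#include <cmath>
#include <cstdio>

float BoxArea(float x1, float y1, float x2, float y2, bool legacy_plus_one) {
  const float w = x2 - x1 + (legacy_plus_one ? 1.0f : 0.0f);
  const float h = y2 - y1 + (legacy_plus_one ? 1.0f : 0.0f);
  return w * h;
}

int MapToFpnLevel(float area, float s0, int lvl0, int k_min, int k_max) {
  // FPN paper: k = floor(k0 + log2(sqrt(area) / s0)), clamped to [k_min, k_max].
  const int k = static_cast<int>(
      std::floor(lvl0 + std::log2(std::sqrt(area) / s0 + 1e-6f)));
  return std::min(std::max(k, k_min), k_max);
}

int main() {
  // A 224x224 RoI sits at the canonical level 4; a 112x112 RoI maps one level
  // down. The legacy "+ 1" only nudges the area (224*224 vs 223*223 here).
  const float a_legacy = BoxArea(0, 0, 223, 223, /*legacy_plus_one=*/true);
  const float a_plain = BoxArea(0, 0, 223, 223, /*legacy_plus_one=*/false);
  std::printf("area legacy=%.0f plain=%.0f\n", a_legacy, a_plain);
  std::printf("level(224x224)=%d\n", MapToFpnLevel(a_legacy, 224.0f, 4, 2, 5));
  std::printf("level(112x112)=%d\n",
              MapToFpnLevel(BoxArea(0, 0, 111, 111, true), 224.0f, 4, 2, 5));
  return 0;
}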
-ERArrXXf MapRoIsToFpnLevels(Eigen::Ref rois, - const float k_min, const float k_max, - const float s0, const float lvl0) { +ERArrXXf MapRoIsToFpnLevels( + Eigen::Ref rois, + const float k_min, + const float k_max, + const float s0, + const float lvl0, + const bool legacy_plus_one) { // Compute level ids - ERArrXXf s = BoxesArea(rois).sqrt(); + ERArrXXf s = BoxesArea(rois, legacy_plus_one).sqrt(); // s0 = cfg.FPN.ROI_CANONICAL_SCALE # default: 224 // lvl0 = cfg.FPN.ROI_CANONICAL_LEVEL # default: 4 @@ -39,8 +43,10 @@ ERArrXXf MapRoIsToFpnLevels(Eigen::Ref rois, // Sort RoIs from highest to lowest individual RoI score based on // values from scores array and limit to n results -void SortAndLimitRoIsByScores(Eigen::Ref scores, int n, - ERArrXXf& rois) { +void SortAndLimitRoIsByScores( + Eigen::Ref scores, + int n, + ERArrXXf& rois) { CAFFE_ENFORCE(rois.rows() == scores.size(), "RoIs and scores count mismatch"); // Create index array with 0, 1, ... N std::vector idxs(rois.rows()); @@ -48,8 +54,12 @@ void SortAndLimitRoIsByScores(Eigen::Ref scores, int n, // Reuse a comparator based on scores and store a copy of RoIs that // will be truncated and manipulated below auto comp = [&scores](int lhs, int rhs) { - if (scores(lhs) > scores(rhs)) return true; - if (scores(lhs) < scores(rhs)) return false; + if (scores(lhs) > scores(rhs)) { + return true; + } + if (scores(lhs) < scores(rhs)) { + return false; + } // To ensure the sort is stable return lhs < rhs; }; @@ -86,9 +96,12 @@ void ArgSort(EArrXi& arr) { // Update out_filtered and out_indices with rows from rois where lvl matches // value in lvls passed in. -void RowsWhereRoILevelEquals(Eigen::Ref rois, - const ERArrXXf& lvls, const int lvl, - ERArrXXf* out_filtered, EArrXi* out_indices) { +void RowsWhereRoILevelEquals( + Eigen::Ref rois, + const ERArrXXf& lvls, + const int lvl, + ERArrXXf* out_filtered, + EArrXi* out_indices) { CAFFE_ENFORCE(out_filtered != nullptr, "Output filtered required"); CAFFE_ENFORCE(out_indices != nullptr, "Output indices required"); CAFFE_ENFORCE(rois.rows() == lvls.rows(), "RoIs and lvls count mismatch"); @@ -142,6 +155,7 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { rois.block(len, 0, n, 5) = roi; const auto& score_in = Input(num_rpn_lvls + i); + CAFFE_ENFORCE_EQ(score_in.size(0), n); // No need to squeeze, since we are reshaping when converting to Eigen // https://docs.scipy.org/doc/numpy/reference/generated/numpy.squeeze.html @@ -167,9 +181,8 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { const int canon_scale = roi_canonical_scale_; const int canon_level = roi_canonical_level_; auto rois_block = rois.block(0, 1, rois.rows(), 4); - auto lvls = utils::MapRoIsToFpnLevels(rois_block, - lvl_min, lvl_max, - canon_scale, canon_level); + auto lvls = utils::MapRoIsToFpnLevels( + rois_block, lvl_min, lvl_max, canon_scale, canon_level, legacy_plus_one_); // equivalent to python code // outputs[0].reshape(rois.shape) @@ -193,7 +206,8 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { // outputs[output_idx + 1].data[...] 
= blob_roi_level // rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) // rois_idx_restore = np.argsort(rois_idx_order) - // blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), outputs[-1]) + // blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), + // outputs[-1]) EArrXi rois_idx_restore; for (int i = 0, lvl = lvl_min; i < num_roi_lvls; i++, lvl++) { ERArrXXf blob_roi_level; @@ -213,7 +227,144 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { roi_out_mat = blob_roi_level; // Append indices from idx_lvl to rois_idx_restore - rois_idx_restore.conservativeResize(rois_idx_restore.size() + idx_lvl.size()); + rois_idx_restore.conservativeResize( + rois_idx_restore.size() + idx_lvl.size()); + rois_idx_restore.tail(idx_lvl.size()) = idx_lvl; + } + utils::ArgSort(rois_idx_restore); + + auto* rois_idx_restore_out = + Output(OutputSize() - 1, {rois_idx_restore.size()}, at::dtype()); + Eigen::Map rois_idx_restore_out_mat( + rois_idx_restore_out->template mutable_data(), + rois_idx_restore.size()); + rois_idx_restore_out_mat = rois_idx_restore; + + return true; +} + +template <> +bool CollectRpnProposalsOp::RunOnDevice() { + int num_rpn_lvls = rpn_max_level_ - rpn_min_level_ + 1; + CAFFE_ENFORCE_EQ(InputSize(), 2 * num_rpn_lvls); + + // Collect rois and scores in Eigen + // rois are in [[batch_idx, x0, y0, x1, y2], ...] format + // Combine predictions across all levels and retain the top scoring + // + // equivalent to python code + // roi_inputs = inputs[:num_rpn_lvls] + // score_inputs = inputs[num_rpn_lvls:] + // rois = np.concatenate([blob.data for blob in roi_inputs]) + // scores = np.concatenate([blob.data for blob in score_inputs]).squeeze() + int proposal_num = 0; + for (int i = 0; i < num_rpn_lvls; i++) { + const auto& roi_in = Input(i); + proposal_num += roi_in.size(0); + } + ERArrXXf rois(proposal_num, 5); + EArrXf scores(proposal_num); + int len = 0; + for (int i = 0; i < num_rpn_lvls; i++) { + const auto& roi_in = Input(i); + const int n = roi_in.size(0); + + Eigen::Map roi(roi_in.data(), n, 5); + rois.block(len, 0, n, 5) = roi; + + const auto& score_in = Input(num_rpn_lvls + i); + CAFFE_ENFORCE_EQ(score_in.size(0), n); + + // No need to squeeze, since we are reshaping when converting to Eigen + // https://docs.scipy.org/doc/numpy/reference/generated/numpy.squeeze.html + Eigen::Map score(score_in.data(), n); + scores.segment(len, n) = score; + + len += n; + } + + // Grab only top rpn_post_nms_topN rois + // equivalent to python code + // inds = np.argsort(-scores)[:rpn_post_nms_topN] + // rois = rois[inds, :] + utils::SortAndLimitRoIsByScores(scores, rpn_post_nms_topN_, rois); + + // equivalent to python code + // outputs[0].reshape(rois.shape) + // outputs[0].data[...] 
= rois + + auto* rois_out = Output(0, {rois.rows(), rois.cols()}, at::dtype()); + Eigen::Map rois_out_mat( + rois_out->template mutable_data(), rois.rows(), rois.cols()); + rois_out_mat = rois; + + return true; +} + +template <> +bool DistributeFpnProposalsOp::RunOnDevice() { + int num_roi_lvls = roi_max_level_ - roi_min_level_ + 1; + CAFFE_ENFORCE_EQ(OutputSize(), num_roi_lvls + 1); + + // Load Input(0) to rois + const auto& rois_in = Input(0); + const int num_rois = rois_in.size(0); + const int dim_rois = rois_in.size(1); + CAFFE_ENFORCE(dim_rois == 4 || dim_rois == 5); + Eigen::Map rois_4or5( + rois_in.data(), num_rois, dim_rois); + ERArrXXf rois = ERArrXXf::Zero(num_rois, 5); + rois.rightCols(dim_rois) = rois_4or5; + + // Distribute + // equivalent to python code + // lvl_min = cfg.FPN.ROI_MIN_LEVEL + // lvl_max = cfg.FPN.ROI_MAX_LEVEL + // lvls = fpn.map_rois_to_fpn_levels(rois[:, 1:5], lvl_min, lvl_max) + const int lvl_min = roi_min_level_; + const int lvl_max = roi_max_level_; + const int canon_scale = roi_canonical_scale_; + const int canon_level = roi_canonical_level_; + auto rois_block = rois.block(0, 1, rois.rows(), 4); + auto lvls = utils::MapRoIsToFpnLevels( + rois_block, lvl_min, lvl_max, canon_scale, canon_level, legacy_plus_one_); + + // Create new roi blobs for each FPN level + // (See: modeling.FPN.add_multilevel_roi_blobs which is similar but annoying + // to generalize to support this particular case.) + // + // equivalent to python code + // rois_idx_order = np.empty((0, )) + // for (output_idx, lvl in enumerate(range(lvl_min, lvl_max + 1))) + // idx_lvl = np.where(lvls == lvl)[0] + // blob_roi_level = rois[idx_lvl, :] + // outputs[output_idx + 1].reshape(blob_roi_level.shape) + // outputs[output_idx + 1].data[...] = blob_roi_level + // rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) + // rois_idx_restore = np.argsort(rois_idx_order) + // blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), + // outputs[-1]) + EArrXi rois_idx_restore; + for (int i = 0, lvl = lvl_min; i < num_roi_lvls; i++, lvl++) { + ERArrXXf blob_roi_level; + EArrXi idx_lvl; + utils::RowsWhereRoILevelEquals(rois, lvls, lvl, &blob_roi_level, &idx_lvl); + + // Output blob_roi_level + + auto* roi_out = Output( + i + 0, + {blob_roi_level.rows(), blob_roi_level.cols()}, + at::dtype()); + Eigen::Map roi_out_mat( + roi_out->template mutable_data(), + blob_roi_level.rows(), + blob_roi_level.cols()); + roi_out_mat = blob_roi_level; + + // Append indices from idx_lvl to rois_idx_restore + rois_idx_restore.conservativeResize( + rois_idx_restore.size() + idx_lvl.size()); rois_idx_restore.tail(idx_lvl.size()) = idx_lvl; } utils::ArgSort(rois_idx_restore); @@ -230,7 +381,13 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { namespace { -REGISTER_CPU_OPERATOR(CollectAndDistributeFpnRpnProposals, CollectAndDistributeFpnRpnProposalsOp); +REGISTER_CPU_OPERATOR( + CollectAndDistributeFpnRpnProposals, + CollectAndDistributeFpnRpnProposalsOp); +REGISTER_CPU_OPERATOR(CollectRpnProposals, CollectRpnProposalsOp); +REGISTER_CPU_OPERATOR( + DistributeFpnProposals, + DistributeFpnProposalsOp); OPERATOR_SCHEMA(CollectAndDistributeFpnRpnProposals) .NumInputs(2, INT_MAX) @@ -344,5 +501,175 @@ will change. SHOULD_NOT_DO_GRADIENT(CollectAndDistributeFpnRpnProposals); +OPERATOR_SCHEMA(CollectRpnProposals) + .NumInputs(2, INT_MAX) + .NumOutputs(1) + .SetDoc(R"DOC( +... 
+)DOC") + .Arg("rpn_max_level", "(int) RPN_MAX_LEVEL") + .Arg("rpn_min_level", "(int) RPN_MIN_LEVEL") + .Arg("rpn_post_nms_topN", "(int) RPN_POST_NMS_TOP_N") + .Input( + 0, + "rpn_rois_fpn2", + "RPN proposals for FPN level 2, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 1, + "rpn_rois_fpn3", + "RPN proposals for FPN level 3, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 2, + "rpn_rois_fpn4", + "RPN proposals for FPN level 4, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 3, + "rpn_rois_fpn5", + "RPN proposals for FPN level 5, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 4, + "rpn_rois_fpn6", + "RPN proposals for FPN level 6, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 5, + "rpn_roi_probs_fpn2", + "RPN objectness probabilities for FPN level 2. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 6, + "rpn_roi_probs_fpn3", + "RPN objectness probabilities for FPN level 3. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 7, + "rpn_roi_probs_fpn4", + "RPN objectness probabilities for FPN level 4. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 8, + "rpn_roi_probs_fpn5", + "RPN objectness probabilities for FPN level 5. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 9, + "rpn_roi_probs_fpn6", + "RPN objectness probabilities for FPN level 6. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Output( + 0, + "rois", + "Top proposals limited to rpn_post_nms_topN total, " + "format (image_index, x1, y1, x2, y2)"); + +SHOULD_NOT_DO_GRADIENT(CollectRpnProposals); + +OPERATOR_SCHEMA(DistributeFpnProposals) + .NumInputs(1) + .NumOutputs(2, INT_MAX) + .SetDoc(R"DOC( +... 
+)DOC") + .Arg("roi_canonical_scale", "(int) ROI_CANONICAL_SCALE") + .Arg("roi_canonical_level", "(int) ROI_CANONICAL_LEVEL") + .Arg("roi_max_level", "(int) ROI_MAX_LEVEL") + .Arg("roi_min_level", "(int) ROI_MIN_LEVEL") + .Input( + 0, + "rois", + "Top proposals limited to rpn_post_nms_topN total, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 0, + "rois_fpn2", + "RPN proposals for ROI level 2, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 1, + "rois_fpn3", + "RPN proposals for ROI level 3, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 2, + "rois_fpn4", + "RPN proposals for ROI level 4, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 3, + "rois_fpn5", + "RPN proposals for ROI level 5, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 4, + "rois_idx_restore", + "Permutation on the concatenation of all " + "rois_fpni, i=min...max, such that when applied the RPN RoIs are " + "restored to their original order in the input blobs."); + +SHOULD_NOT_DO_GRADIENT(DistributeFpnProposals); + } // namespace } // namespace caffe2 + +// clang-format off +C10_REGISTER_CAFFE2_OPERATOR_CPU( + CollectAndDistributeFpnRpnProposals, + "_caffe2::CollectAndDistributeFpnRpnProposals(" + "Tensor[] input_list, " + "int roi_canonical_scale, " + "int roi_canonical_level, " + "int roi_max_level, " + "int roi_min_level, " + "int rpn_max_level, " + "int rpn_min_level, " + "int rpn_post_nms_topN, " + "bool legacy_plus_one" + ") -> (" + "Tensor rois, " + "Tensor rois_fpn2, " + "Tensor rois_fpn3, " + "Tensor rois_fpn4, " + "Tensor rois_fpn5, " + "Tensor rois_idx_restore_int32" + ")", + caffe2::CollectAndDistributeFpnRpnProposalsOp); + +C10_REGISTER_CAFFE2_OPERATOR_CPU( + CollectRpnProposals, + "_caffe2::CollectRpnProposals(" + "Tensor[] input_list, " + "int rpn_max_level, " + "int rpn_min_level, " + "int rpn_post_nms_topN" + ") -> (" + "Tensor rois" + ")", + caffe2::CollectRpnProposalsOp); + +C10_REGISTER_CAFFE2_OPERATOR_CPU( + DistributeFpnProposals, + "_caffe2::DistributeFpnProposals(" + "Tensor rois, " + "int roi_canonical_scale, " + "int roi_canonical_level, " + "int roi_max_level, " + "int roi_min_level, " + "bool legacy_plus_one" + ") -> (" + "Tensor rois_fpn2, " + "Tensor rois_fpn3, " + "Tensor rois_fpn4, " + "Tensor rois_fpn5, " + "Tensor rois_idx_restore_int32" + ")", + caffe2::DistributeFpnProposalsOp); +// clang-format on diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index a73ef60aa41a..ed44b4cb130c 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -1,28 +1,39 @@ #ifndef CAFFE2_OPERATORS_COLLECT_AND_DISTRIBUTE_FPN_RPN_PROPOSALS_OP_H_ #define CAFFE2_OPERATORS_COLLECT_AND_DISTRIBUTE_FPN_RPN_PROPOSALS_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" +C10_DECLARE_CAFFE2_OPERATOR(CollectAndDistributeFpnRpnProposals); +C10_DECLARE_CAFFE2_OPERATOR(CollectRpnProposals); +C10_DECLARE_CAFFE2_OPERATOR(DistributeFpnProposals); + namespace caffe2 { namespace utils { // Compute the area of an array of boxes. -ERArrXXf BoxesArea(const ERArrXXf& boxes); +ERArrXXf BoxesArea(const ERArrXXf& boxes, const bool legacy_plus_one = false); // Determine which FPN level each RoI in a set of RoIs should map to based // on the heuristic in the FPN paper. 
-ERArrXXf MapRoIsToFpnLevels(Eigen::Ref rois, - const float k_min, const float k_max, - const float s0, const float lvl0); +ERArrXXf MapRoIsToFpnLevels( + Eigen::Ref rois, + const float k_min, + const float k_max, + const float s0, + const float lvl0, + const bool legacy_plus_one = false); // Sort RoIs from highest to lowest individual RoI score based on // values from scores array and limit to n results -void SortAndLimitRoIsByScores(Eigen::Ref scores, int n, - ERArrXXf& rois); +void SortAndLimitRoIsByScores( + Eigen::Ref scores, + int n, + ERArrXXf& rois); // Updates arr to be indices that would sort the array. Implementation of // https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html @@ -30,18 +41,22 @@ void ArgSort(EArrXi& arr); // Update out_filtered and out_indices with rows from rois where lvl matches // value in lvls passed in. -void RowsWhereRoILevelEquals(Eigen::Ref rois, - const ERArrXXf& lvls, const int lvl, - ERArrXXf* out_filtered, EArrXi* out_indices); +void RowsWhereRoILevelEquals( + Eigen::Ref rois, + const ERArrXXf& lvls, + const int lvl, + ERArrXXf* out_filtered, + EArrXi* out_indices); } // namespace utils // C++ implementation of CollectAndDistributeFpnRpnProposalsOp // Merge RPN proposals generated at multiple FPN levels and then -// distribute those proposals to their appropriate FPN levels for Faster RCNN. -// An anchor at one FPN level may predict an RoI that will map to another -// level, hence the need to redistribute the proposals. -// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py +// distribute those proposals to their appropriate FPN levels for Faster +// RCNN. An anchor at one FPN level may predict an RoI that will map to +// another level, hence the need to redistribute the proposals. +// Reference: +// facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: @@ -62,7 +77,9 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator { rpn_min_level_( this->template GetSingleArgument("rpn_min_level", 2)), rpn_post_nms_topN_( - this->template GetSingleArgument("rpn_post_nms_topN", 2000)) { + this->template GetSingleArgument("rpn_post_nms_topN", 2000)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { CAFFE_ENFORCE_GE( roi_max_level_, roi_min_level_, @@ -77,7 +94,7 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator { c10::to_string(rpn_min_level_) + "."); } - ~CollectAndDistributeFpnRpnProposalsOp() {} + ~CollectAndDistributeFpnRpnProposalsOp() override {} bool RunOnDevice() override; @@ -96,6 +113,84 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator { int rpn_min_level_{2}; // RPN_POST_NMS_TOP_N int rpn_post_nms_topN_{2000}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; +}; + +template +class CollectRpnProposalsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + template + explicit CollectRpnProposalsOp(Args&&... 
args) + : Operator(std::forward(args)...), + rpn_max_level_( + this->template GetSingleArgument("rpn_max_level", 6)), + rpn_min_level_( + this->template GetSingleArgument("rpn_min_level", 2)), + rpn_post_nms_topN_( + this->template GetSingleArgument("rpn_post_nms_topN", 2000)) { + CAFFE_ENFORCE_GE( + rpn_max_level_, + rpn_min_level_, + "rpn_max_level " + c10::to_string(rpn_max_level_) + + " must be greater than or equal to rpn_min_level " + + c10::to_string(rpn_min_level_) + "."); + } + + ~CollectRpnProposalsOp() override {} + + bool RunOnDevice() override; + + protected: + // RPN_MAX_LEVEL + int rpn_max_level_{6}; + // RPN_MIN_LEVEL + int rpn_min_level_{2}; + // RPN_POST_NMS_TOP_N + int rpn_post_nms_topN_{2000}; +}; + +template +class DistributeFpnProposalsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + template + explicit DistributeFpnProposalsOp(Args&&... args) + : Operator(std::forward(args)...), + roi_canonical_scale_( + this->template GetSingleArgument("roi_canonical_scale", 224)), + roi_canonical_level_( + this->template GetSingleArgument("roi_canonical_level", 4)), + roi_max_level_( + this->template GetSingleArgument("roi_max_level", 5)), + roi_min_level_( + this->template GetSingleArgument("roi_min_level", 2)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { + CAFFE_ENFORCE_GE( + roi_max_level_, + roi_min_level_, + "roi_max_level " + c10::to_string(roi_max_level_) + + " must be greater than or equal to roi_min_level " + + c10::to_string(roi_min_level_) + "."); + } + + ~DistributeFpnProposalsOp() override {} + + bool RunOnDevice() override; + + protected: + // ROI_CANONICAL_SCALE + int roi_canonical_scale_{224}; + // ROI_CANONICAL_LEVEL + int roi_canonical_level_{4}; + // ROI_MAX_LEVEL + int roi_max_level_{5}; + // ROI_MIN_LEVEL + int roi_min_level_{2}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; }; } // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/add_cpu.cc b/caffe2/operators/experimental/c10/cpu/add_cpu.cc index 052cf1e6623b..a06e6e3781c9 100644 --- a/caffe2/operators/experimental/c10/cpu/add_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/add_cpu.cc @@ -71,18 +71,10 @@ void add_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Add", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("legacy_broadcast", BoolType::get()), - c10::Argument("axis", IntType::get())}), - (std::vector{})), - c10::kernel), &add_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Add", + c10::RegisterOperators::options() + .kernel), &add_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc index 069281cd6ff2..592c4e4c1b78 100644 --- a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc @@ -44,14 +44,10 @@ class averaged_loss_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::AveragedLoss", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::AveragedLoss", + c10::RegisterOperators::options() + 
.kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc index c293ababa8a7..ed1e58ecdb39 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc @@ -65,15 +65,10 @@ void batch_gather_op_cpu(const at::Tensor& data, } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::BatchGather", - "", - (std::vector{c10::Argument("data"), - c10::Argument("indices"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::BatchGather", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc index bbd10cc4510e..e782a4c8beae 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc @@ -269,19 +269,10 @@ class batch_matmul_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::BatchMatmul", - "", - (std::vector{ - c10::Argument("A"), - c10::Argument("B"), - c10::Argument("output"), - c10::Argument("trans_a", IntType::get()), - c10::Argument("trans_b", IntType::get()), - c10::Argument("broadcast", IntType::get())}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::BatchMatmul", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc index 84db51298a96..8668429f03b2 100644 --- a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc @@ -87,17 +87,10 @@ void cast_op_cpu( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Cast", - "", - (std::vector{ - c10::Argument("input"), - c10::Argument("output"), - c10::Argument("to_dtype", IntType::get()), - }), - (std::vector{})), - c10::kernel(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Cast", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc index e84e8d07a5a3..999b002e5e80 100644 --- a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc @@ -13,7 +13,7 @@ namespace caffe2 { namespace { template void concat_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, const at::Tensor& split_, int64_t axis, @@ -105,20 +105,12 @@ void concat_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Concat", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("split_info"), - c10::Argument("add", IntType::get()), - c10::Argument("add_axis", IntType::get())}), - (std::vector{})), - c10::kernel< + "_c10_experimental::Concat", + c10::RegisterOperators::options() + .kernel< decltype(concat_op_cpu_impl), - &concat_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + 
&concat_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc b/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc index 69a4e75479fd..f73fb0284c54 100644 --- a/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc @@ -25,15 +25,12 @@ void enforce_finite_op_impl_cpu(const at::Tensor& input_) { } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::EnforceFinite", - "", - (std::vector{c10::Argument("input")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::EnforceFinite", + c10::RegisterOperators::options() + .kernel< decltype(enforce_finite_op_impl_cpu), - &enforce_finite_op_impl_cpu>(), - c10::dispatchKey(CPUTensorId())); + &enforce_finite_op_impl_cpu>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc b/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc index 780ff2945a88..60cf805dcb8b 100644 --- a/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc @@ -14,12 +14,12 @@ class expand_dims_cpu final : public c10::OperatorKernel { void operator()( const at::Tensor& input_, const at::Tensor& output_, - ArrayRef dims) { + std::vector dims) { Tensor input(input_); Tensor output(output_); if (!initialized_) { - dims_ = dims.vec(); + dims_ = std::move(dims); auto originalSize = dims_.size(); CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); std::sort(dims_.begin(), dims_.end()); @@ -55,15 +55,10 @@ class expand_dims_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::ExpandDims", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output"), - c10::Argument("dims", ListType::ofInts())}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::ExpandDims", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/fc_cpu.cc b/caffe2/operators/experimental/c10/cpu/fc_cpu.cc index eac61b5a529a..3993da003053 100644 --- a/caffe2/operators/experimental/c10/cpu/fc_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/fc_cpu.cc @@ -129,18 +129,10 @@ class fc_op_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::FullyConnected", - "", - (std::vector{c10::Argument("X"), - c10::Argument("W"), - c10::Argument("b"), - c10::Argument("output"), - c10::Argument("axis", IntType::get()), - c10::Argument("axis_w", IntType::get())}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::FullyConnected", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc index 3e4966253858..40122cc3d803 100644 --- a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc @@ -45,10 +45,10 @@ void filler_init( template void given_tensor_fill_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, - ArrayRef shape, - ArrayRef extra_shape, + std::vector shape, + std::vector extra_shape, bool 
input_as_shape, const at::Tensor& values_) { Tensor output(output_); @@ -70,10 +70,10 @@ void given_tensor_fill_op_cpu_impl( } void constant_fill_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, - ArrayRef shape, - ArrayRef extra_shape, + std::vector shape, + std::vector extra_shape, bool input_as_shape, int64_t dtype, c10::Scalar value) { @@ -110,10 +110,10 @@ void constant_fill_op_cpu_impl( } void uniform_fill_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, - ArrayRef shape, - ArrayRef extra_shape, + std::vector shape, + std::vector extra_shape, bool input_as_shape, double min, double max) { @@ -145,86 +145,36 @@ void uniform_fill_op_cpu_impl( static auto registry = c10::RegisterOperators() - .op(FunctionSchema( - "_c10_experimental::ConstantFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("dtype", IntType::get()), - c10::Argument("value", NumberType::get())}), - (std::vector{})), - c10::kernel< + .op("_c10_experimental::ConstantFill", + c10::RegisterOperators::options() + .kernel< decltype(constant_fill_op_cpu_impl), - &constant_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::UniformFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("min", FloatType::get()), - c10::Argument("max", FloatType::get())}), - (std::vector{})), - c10::kernel< + &constant_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::UniformFill", + c10::RegisterOperators::options() + .kernel< decltype(uniform_fill_op_cpu_impl), - &uniform_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::GivenTensorFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{})), - c10::kernel< + &uniform_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::GivenTensorFill", + c10::RegisterOperators::options() + .kernel< decltype(given_tensor_fill_op_cpu_impl), - &given_tensor_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::GivenTensorIntFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{})), - c10::kernel< + &given_tensor_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::GivenTensorIntFill", + c10::RegisterOperators::options() + .kernel< decltype(given_tensor_fill_op_cpu_impl), - &given_tensor_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::GivenTensorInt64Fill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - 
c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{})), - c10::kernel< + &given_tensor_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::GivenTensorInt64Fill", + c10::RegisterOperators::options() + .kernel< decltype(given_tensor_fill_op_cpu_impl), - &given_tensor_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &given_tensor_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc b/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc index 70cae21810a1..a2357f05a3a6 100644 --- a/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc @@ -27,17 +27,12 @@ void flatten_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Flatten", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output"), - c10::Argument("axis", IntType::get())}), - (std::vector{})), - c10::kernel< + "_c10_experimental::Flatten", + c10::RegisterOperators::options() + .kernel< decltype(flatten_op_cpu_impl), - &flatten_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &flatten_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/mul_cpu.cc b/caffe2/operators/experimental/c10/cpu/mul_cpu.cc index b64cbac56dff..eae2bf3a3764 100644 --- a/caffe2/operators/experimental/c10/cpu/mul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/mul_cpu.cc @@ -72,18 +72,10 @@ void mul_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Mul", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("legacy_broadcast", BoolType::get()), - c10::Argument("axis", IntType::get())}), - (std::vector{})), - c10::kernel), &mul_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Mul", + c10::RegisterOperators::options() + .kernel), &mul_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/relu_cpu.cc b/caffe2/operators/experimental/c10/cpu/relu_cpu.cc index 0a0225cbbdba..d5eec0c87c73 100644 --- a/caffe2/operators/experimental/c10/cpu/relu_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/relu_cpu.cc @@ -41,14 +41,10 @@ void relu_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Relu", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel), &relu_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Relu", + c10::RegisterOperators::options() + .kernel), &relu_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc index cf60fd01a922..d78cb4f6192d 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc @@ -24,16 +24,12 @@ void sigmoid_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Sigmoid", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::Sigmoid", + 
c10::RegisterOperators::options() + .kernel< decltype(sigmoid_op_cpu_impl), - &sigmoid_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &sigmoid_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc index 460af4c2262a..f9bcabcf39f1 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc @@ -71,20 +71,12 @@ void sigmoid_cross_entropy_with_logits_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::SigmoidCrossEntropyWithLogits", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("log_D_trick", BoolType::get()), - c10::Argument("unjoined_lr_loss", BoolType::get())}), - (std::vector{})), - c10::kernel< + "_c10_experimental::SigmoidCrossEntropyWithLogits", + c10::RegisterOperators::options() + .kernel< decltype(sigmoid_cross_entropy_with_logits_op_cpu_impl), - &sigmoid_cross_entropy_with_logits_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &sigmoid_cross_entropy_with_logits_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc index 98108e0c4fac..775a1b72968f 100644 --- a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc @@ -82,18 +82,12 @@ void sparse_lengths_sum_op_cpu( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::SparseLengthsSum", - "", - (std::vector{c10::Argument("data"), - c10::Argument("indices"), - c10::Argument("lengths"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::SparseLengthsSum", + c10::RegisterOperators::options() + .kernel< decltype(sparse_lengths_sum_op_cpu), - &sparse_lengths_sum_op_cpu>(), - c10::dispatchKey(CPUTensorId())); + &sparse_lengths_sum_op_cpu>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc b/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc index bbfc8910be0e..d43aa8ba6929 100644 --- a/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc @@ -20,16 +20,12 @@ void stop_gradient_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::StopGradient", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::StopGradient", + c10::RegisterOperators::options() + .kernel< decltype(stop_gradient_op_cpu_impl), - &stop_gradient_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &stop_gradient_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/generate_proposals_op.cc b/caffe2/operators/generate_proposals_op.cc index 7e6954fae146..7279ff17b761 100644 --- a/caffe2/operators/generate_proposals_op.cc +++ b/caffe2/operators/generate_proposals_op.cc @@ -228,17 +228,19 @@ void GenerateProposalsOp::ProposalsForOneImage( bbox_deltas_sorted, bbox_weights, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, 
angle_bound_on_, angle_bound_lo_, angle_bound_hi_); // 2. clip proposals to image (may result in proposals with zero area // that will be removed in the next step) - proposals = - utils::clip_boxes(proposals, im_info[0], im_info[1], clip_angle_thresh_); + proposals = utils::clip_boxes( + proposals, im_info[0], im_info[1], clip_angle_thresh_, legacy_plus_one_); // 3. remove predicted boxes with either height or width < min_size - auto keep = utils::filter_boxes(proposals, min_size, im_info); + auto keep = + utils::filter_boxes(proposals, min_size, im_info, legacy_plus_one_); DCHECK_LE(keep.size(), scores_sorted.size()); // 6. apply loose nms (e.g. threshold = 0.7) @@ -246,9 +248,15 @@ void GenerateProposalsOp::ProposalsForOneImage( // 8. return the top proposals (-> RoIs top) if (post_nms_topN > 0 && post_nms_topN < keep.size()) { keep = utils::nms_cpu( - proposals, scores_sorted, keep, nms_thresh, post_nms_topN); + proposals, + scores_sorted, + keep, + nms_thresh, + post_nms_topN, + legacy_plus_one_); } else { - keep = utils::nms_cpu(proposals, scores_sorted, keep, nms_thresh); + keep = utils::nms_cpu( + proposals, scores_sorted, keep, nms_thresh, -1, legacy_plus_one_); } // Generate outputs @@ -406,6 +414,7 @@ SHOULD_NOT_DO_GRADIENT(GenerateProposalsCPP); } // namespace caffe2 +// clang-format off C10_REGISTER_CAFFE2_OPERATOR_CPU( GenerateProposals, "_caffe2::GenerateProposals(" @@ -421,6 +430,8 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( "bool angle_bound_on, " "int angle_bound_lo, " "int angle_bound_hi, " - "float clip_angle_thresh" + "float clip_angle_thresh, " + "bool legacy_plus_one" ") -> (Tensor output_0, Tensor output_1)", caffe2::GenerateProposalsOp); +// clang-format on diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu index 2bb1d9b2768b..fcae9d6fe822 100644 --- a/caffe2/operators/generate_proposals_op.cu +++ b/caffe2/operators/generate_proposals_op.cu @@ -23,6 +23,7 @@ __global__ void GeneratePreNMSUprightBoxesKernel( const float* d_img_info_vec, const int num_images, const float bbox_xform_clip, + const bool legacy_plus_one, float4* d_out_boxes, const int prenms_nboxes, // leading dimension of out_boxes float* d_inout_scores, @@ -81,35 +82,35 @@ __global__ void GeneratePreNMSUprightBoxesKernel( dh = fmin(dh, bbox_xform_clip); // Applying the deltas - float width = x2 - x1 + 1.0f; + float width = x2 - x1 + float(int(legacy_plus_one)); const float ctr_x = x1 + 0.5f * width; const float pred_ctr_x = ctr_x + width * dx; // TODO fuse madd const float pred_w = width * expf(dw); x1 = pred_ctr_x - 0.5f * pred_w; - x2 = pred_ctr_x + 0.5f * pred_w - 1.0f; + x2 = pred_ctr_x + 0.5f * pred_w - float(int(legacy_plus_one)); - float height = y2 - y1 + 1.0f; + float height = y2 - y1 + float(int(legacy_plus_one)); const float ctr_y = y1 + 0.5f * height; const float pred_ctr_y = ctr_y + height * dy; const float pred_h = height * expf(dh); y1 = pred_ctr_y - 0.5f * pred_h; - y2 = pred_ctr_y + 0.5f * pred_h - 1.0f; + y2 = pred_ctr_y + 0.5f * pred_h - float(int(legacy_plus_one)); // Clipping box to image const float img_height = d_img_info_vec[3 * image_index + 0]; const float img_width = d_img_info_vec[3 * image_index + 1]; const float min_size_scaled = min_size * d_img_info_vec[3 * image_index + 2]; - x1 = fmax(fmin(x1, img_width - 1.0f), 0.0f); - y1 = fmax(fmin(y1, img_height - 1.0f), 0.0f); - x2 = fmax(fmin(x2, img_width - 1.0f), 0.0f); - y2 = fmax(fmin(y2, img_height - 1.0f), 0.0f); + x1 = fmax(fmin(x1, img_width - float(int(legacy_plus_one))), 0.0f); + 
y1 = fmax(fmin(y1, img_height - float(int(legacy_plus_one))), 0.0f); + x2 = fmax(fmin(x2, img_width - float(int(legacy_plus_one))), 0.0f); + y2 = fmax(fmin(y2, img_height - float(int(legacy_plus_one))), 0.0f); // Filter boxes // Removing boxes with one dim < min_size // (center of box is in image, because of previous step) - width = x2 - x1 + 1.0f; // may have changed - height = y2 - y1 + 1.0f; + width = x2 - x1 + float(int(legacy_plus_one)); // may have changed + height = y2 - y1 + float(int(legacy_plus_one)); bool keep_box = fmin(width, height) >= min_size_scaled; // We are not deleting the box right now even if !keep_box @@ -140,6 +141,7 @@ __global__ void GeneratePreNMSRotatedBoxesKernel( const float* d_img_info_vec, const int num_images, const float bbox_xform_clip, + const bool legacy_plus_one, const bool angle_bound_on, const int angle_bound_lo, const int angle_bound_hi, @@ -229,22 +231,22 @@ __global__ void GeneratePreNMSRotatedBoxesKernel( min_size * d_img_info_vec[3 * image_index + 2]; if (fabs(box.a) <= clip_angle_thresh) { // Convert from [x_ctr, y_ctr, w, h] to [x1, y1, x2, y2] - float x1 = box.x_ctr - (box.w - 1.f) / 2.f; - float y1 = box.y_ctr - (box.h - 1.f) / 2.f; - float x2 = x1 + box.w - 1.f; - float y2 = y1 + box.h - 1.f; + float x1 = box.x_ctr - (box.w - float(int(legacy_plus_one))) / 2.f; + float y1 = box.y_ctr - (box.h - float(int(legacy_plus_one))) / 2.f; + float x2 = x1 + box.w - float(int(legacy_plus_one)); + float y2 = y1 + box.h - float(int(legacy_plus_one)); // Clip - x1 = fmax(fmin(x1, img_width - 1.0f), 0.0f); - y1 = fmax(fmin(y1, img_height - 1.0f), 0.0f); - x2 = fmax(fmin(x2, img_width - 1.0f), 0.0f); - y2 = fmax(fmin(y2, img_height - 1.0f), 0.0f); + x1 = fmax(fmin(x1, img_width - float(int(legacy_plus_one))), 0.0f); + y1 = fmax(fmin(y1, img_height - float(int(legacy_plus_one))), 0.0f); + x2 = fmax(fmin(x2, img_width - float(int(legacy_plus_one))), 0.0f); + y2 = fmax(fmin(y2, img_height - float(int(legacy_plus_one))), 0.0f); // Convert back to [x_ctr, y_ctr, w, h] box.x_ctr = (x1 + x2) / 2.f; box.y_ctr = (y1 + y2) / 2.f; - box.w = x2 - x1 + 1.f; - box.h = y2 - y1 + 1.f; + box.w = x2 - x1 + float(int(legacy_plus_one)); + box.h = y2 - y1 + float(int(legacy_plus_one)); } // Filter boxes. 
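// ----------------------------------------------------------------------------
// Sketch (editor's illustration, not part of the patch): every hard-coded
// "+ 1.0f" in the box decode/clip code above becomes
// "+ float(int(legacy_plus_one))", so the old inclusive-pixel convention is
// kept only when the flag is true. A scalar illustration (helper name invented
// for this note):

inline float box_width_sketch(float x1, float x2, bool legacy_plus_one) {
  return x2 - x1 + float(int(legacy_plus_one));
}

// For x1 = 10.f, x2 = 19.f:
//   box_width_sketch(10.f, 19.f, true)  -> 10.f  (legacy "+ 1" convention)
//   box_width_sketch(10.f, 19.f, false) ->  9.f  (plain coordinate difference)
// ----------------------------------------------------------------------------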
@@ -485,6 +487,7 @@ bool GenerateProposalsOp::RunOnDevice() { d_im_info_vec, num_images, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, reinterpret_cast(d_boxes), nboxes_to_generate, d_sorted_scores, @@ -507,6 +510,7 @@ bool GenerateProposalsOp::RunOnDevice() { d_im_info_vec, num_images, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, angle_bound_on_, angle_bound_lo_, angle_bound_hi_, @@ -597,6 +601,7 @@ bool GenerateProposalsOp::RunOnDevice() { d_image_prenms_boxes, prenms_nboxes, rpn_nms_thresh_, + legacy_plus_one_, d_image_boxes_keep_list, &nkeep, dev_nms_mask_, @@ -669,5 +674,4 @@ REGISTER_CUDA_OPERATOR(GenerateProposals, GenerateProposalsOp); C10_REGISTER_CAFFE2_OPERATOR_CUDA( GenerateProposals, - caffe2::GenerateProposalsOp -); + caffe2::GenerateProposalsOp); diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index f2c4a8f6ec1f..e55972c323fc 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -1,9 +1,9 @@ #ifndef CAFFE2_OPERATORS_GENERATE_PROPOSALS_OP_H_ #define CAFFE2_OPERATORS_GENERATE_PROPOSALS_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/operator.h" -#include "caffe2/core/c10_operator.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" @@ -79,7 +79,7 @@ template class GenerateProposalsOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - template + template explicit GenerateProposalsOp(Args&&... args) : Operator(std::forward(args)...), spatial_scale_( @@ -99,7 +99,9 @@ class GenerateProposalsOp final : public Operator { angle_bound_hi_( this->template GetSingleArgument("angle_bound_hi", 90)), clip_angle_thresh_( - this->template GetSingleArgument("clip_angle_thresh", 1.0)) {} + this->template GetSingleArgument("clip_angle_thresh", 1.0)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) {} ~GenerateProposalsOp() {} @@ -142,6 +144,8 @@ class GenerateProposalsOp final : public Operator { // tolerance for backward compatibility. Set to negative value for // no clipping. 
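// Note (editor's sketch, not part of the patch): the constructor hunk above
// gives the new argument a default of true, so nets that do not set it keep
// the old "+ 1" behavior. Written out with the template parameters implied by
// the float/bool members declared in this class (reconstructed as an
// assumption, not quoted from the file):
//
//   clip_angle_thresh_(
//       this->template GetSingleArgument<float>("clip_angle_thresh", 1.0)),
//   legacy_plus_one_(
//       this->template GetSingleArgument<bool>("legacy_plus_one", true)) {}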
float clip_angle_thresh_{1.0}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; // Scratch space required by the CUDA version // CUB buffers diff --git a/caffe2/operators/generate_proposals_op_gpu_test.cc b/caffe2/operators/generate_proposals_op_gpu_test.cc index da3f56a284ec..d328f81726c0 100644 --- a/caffe2/operators/generate_proposals_op_gpu_test.cc +++ b/caffe2/operators/generate_proposals_op_gpu_test.cc @@ -362,7 +362,8 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0GPU) { rois_gt_xyxy.block(0, 0, rois_gt.rows(), 0); // rois_gt in [x_ctr, y_ctr, w, h] format rois_gt.block(0, 1, rois_gt.rows(), 4) = utils::bbox_xyxy_to_ctrwh( - rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array()); + rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array(), + true /* legacy_plus_one */); // Angle rois_gt.block(0, 5, rois_gt.rows(), 1) = ERMatXf::Constant(rois_gt.rows(), 1, angle); diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index f79cf6891241..def7f286e85c 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -147,7 +147,8 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) { // Convert to RRPN format and add angles ERMatXf anchors(3, 5); - anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh(anchors_xyxy.array()); + anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh( + anchors_xyxy.array(), true /* legacy_plus_one */); std::vector angles{0.0, 45.0, -120.0}; for (int i = 0; i < anchors.rows(); ++i) { anchors(i, 4) = angles[i % angles.size()]; @@ -170,8 +171,8 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) { // Convert gt to RRPN format and add angles ERMatXf all_anchors_gt(36, 5); - all_anchors_gt.block(0, 0, 36, 4) = - utils::bbox_xyxy_to_ctrwh(all_anchors_gt_xyxy.array()); + all_anchors_gt.block(0, 0, 36, 4) = utils::bbox_xyxy_to_ctrwh( + all_anchors_gt_xyxy.array(), true /* legacy_plus_one */); for (int i = 0; i < all_anchors_gt.rows(); ++i) { all_anchors_gt(i, 4) = angles[i % angles.size()]; } @@ -196,7 +197,8 @@ TEST(GenerateProposalsTest, TestComputeSortedAnchorsRotated) { // Convert to RRPN format and add angles ERMatXf anchors(3, 5); - anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh(anchors_xyxy.array()); + anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh( + anchors_xyxy.array(), true /* legacy_plus_one */); std::vector angles{0.0, 45.0, -120.0}; for (int i = 0; i < anchors.rows(); ++i) { anchors(i, 4) = angles[i % angles.size()]; @@ -524,7 +526,8 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { rois_gt_xyxy.block(0, 0, rois_gt.rows(), 1); // rois_gt in [x_ctr, y_ctr, w, h] format rois_gt.block(0, 1, rois_gt.rows(), 4) = utils::bbox_xyxy_to_ctrwh( - rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array()); + rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array(), + true /* legacy_plus_one */); // Angle rois_gt.block(0, 5, rois_gt.rows(), 1) = ERMatXf::Constant(rois_gt.rows(), 1, angle); diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 04dd1912f6a7..92f9714c5a10 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -39,7 +39,8 @@ EArrXXt bbox_transform_upright( const Eigen::ArrayBase& deltas, const std::vector& weights = std::vector{1.0, 1.0, 1.0, 1.0}, - const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT) 
{ + const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT, + const bool legacy_plus_one = false) { using T = typename Derived1::Scalar; using EArrXX = EArrXXt; using EArrX = EArrXt; @@ -52,8 +53,8 @@ EArrXXt bbox_transform_upright( CAFFE_ENFORCE_EQ(boxes.cols(), 4); CAFFE_ENFORCE_EQ(deltas.cols(), 4); - EArrX widths = boxes.col(2) - boxes.col(0) + T(1.0); - EArrX heights = boxes.col(3) - boxes.col(1) + T(1.0); + EArrX widths = boxes.col(2) - boxes.col(0) + T(int(legacy_plus_one)); + EArrX heights = boxes.col(3) - boxes.col(1) + T(int(legacy_plus_one)); auto ctr_x = boxes.col(0) + T(0.5) * widths; auto ctr_y = boxes.col(1) + T(0.5) * heights; @@ -75,9 +76,9 @@ EArrXXt bbox_transform_upright( // y1 pred_boxes.col(1) = pred_ctr_y - T(0.5) * pred_h; // x2 - pred_boxes.col(2) = pred_ctr_x + T(0.5) * pred_w - T(1.0); + pred_boxes.col(2) = pred_ctr_x + T(0.5) * pred_w - T(int(legacy_plus_one)); // y2 - pred_boxes.col(3) = pred_ctr_y + T(0.5) * pred_h - T(1.0); + pred_boxes.col(3) = pred_ctr_y + T(0.5) * pred_h - T(int(legacy_plus_one)); return pred_boxes; } @@ -166,13 +167,15 @@ EArrXXt bbox_transform( const std::vector& weights = std::vector{1.0, 1.0, 1.0, 1.0}, const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT, + const bool legacy_plus_one = false, const bool angle_bound_on = true, const int angle_bound_lo = -90, const int angle_bound_hi = 90) { CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5); if (boxes.cols() == 4) { // Upright boxes - return bbox_transform_upright(boxes, deltas, weights, bbox_xform_clip); + return bbox_transform_upright( + boxes, deltas, weights, bbox_xform_clip, legacy_plus_one); } else { // Rotated boxes with angle info return bbox_transform_rotated( @@ -188,7 +191,8 @@ EArrXXt bbox_transform( template EArrXXt bbox_xyxy_to_ctrwh( - const Eigen::ArrayBase& boxes) { + const Eigen::ArrayBase& boxes, + bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(boxes.cols(), 4); const auto& x1 = boxes.col(0); @@ -199,14 +203,15 @@ EArrXXt bbox_xyxy_to_ctrwh( EArrXXt ret(boxes.rows(), 4); ret.col(0) = (x1 + x2) / 2.0; // x_ctr ret.col(1) = (y1 + y2) / 2.0; // y_ctr - ret.col(2) = x2 - x1 + 1.0; // w - ret.col(3) = y2 - y1 + 1.0; // h + ret.col(2) = x2 - x1 + int(legacy_plus_one); // w + ret.col(3) = y2 - y1 + int(legacy_plus_one); // h return ret; } template EArrXXt bbox_ctrwh_to_xyxy( - const Eigen::ArrayBase& boxes) { + const Eigen::ArrayBase& boxes, + const bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(boxes.cols(), 4); const auto& x_ctr = boxes.col(0); @@ -215,10 +220,10 @@ EArrXXt bbox_ctrwh_to_xyxy( const auto& h = boxes.col(3); EArrXXt ret(boxes.rows(), 4); - ret.col(0) = x_ctr - (w - 1) / 2.0; // x1 - ret.col(1) = y_ctr - (h - 1) / 2.0; // y1 - ret.col(2) = x_ctr + (w - 1) / 2.0; // x2 - ret.col(3) = y_ctr + (h - 1) / 2.0; // y2 + ret.col(0) = x_ctr - (w - int(legacy_plus_one)) / 2.0; // x1 + ret.col(1) = y_ctr - (h - int(legacy_plus_one)) / 2.0; // y1 + ret.col(2) = x_ctr + (w - int(legacy_plus_one)) / 2.0; // x2 + ret.col(3) = y_ctr + (h - int(legacy_plus_one)) / 2.0; // y2 return ret; } @@ -228,19 +233,20 @@ template EArrXXt clip_boxes_upright( const Eigen::ArrayBase& boxes, int height, - int width) { + int width, + bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 4); EArrXXt ret(boxes.rows(), boxes.cols()); // x1 >= 0 && x1 < width - ret.col(0) = boxes.col(0).cwiseMin(width - 1).cwiseMax(0); + ret.col(0) = boxes.col(0).cwiseMin(width - int(legacy_plus_one)).cwiseMax(0); // y1 >= 0 && y1 < height - ret.col(1) = boxes.col(1).cwiseMin(height - 
1).cwiseMax(0); + ret.col(1) = boxes.col(1).cwiseMin(height - int(legacy_plus_one)).cwiseMax(0); // x2 >= 0 && x2 < width - ret.col(2) = boxes.col(2).cwiseMin(width - 1).cwiseMax(0); + ret.col(2) = boxes.col(2).cwiseMin(width - int(legacy_plus_one)).cwiseMax(0); // y2 >= 0 && y2 < height - ret.col(3) = boxes.col(3).cwiseMin(height - 1).cwiseMax(0); + ret.col(3) = boxes.col(3).cwiseMin(height - int(legacy_plus_one)).cwiseMax(0); return ret; } @@ -263,7 +269,8 @@ EArrXXt clip_boxes_rotated( const Eigen::ArrayBase& boxes, int height, int width, - float angle_thresh = 1.0) { + float angle_thresh = 1.0, + bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 5); const auto& angles = boxes.col(4); @@ -275,13 +282,13 @@ EArrXXt clip_boxes_rotated( // Convert to [x1, y1, x2, y2] format and clip them const auto& upright_boxes_xyxy = - bbox_ctrwh_to_xyxy(upright_boxes.leftCols(4)); + bbox_ctrwh_to_xyxy(upright_boxes.leftCols(4), legacy_plus_one); const auto& clipped_upright_boxes_xyxy = - clip_boxes_upright(upright_boxes_xyxy, height, width); + clip_boxes_upright(upright_boxes_xyxy, height, width, legacy_plus_one); // Convert back to [x_ctr, y_ctr, w, h, angle] and update upright boxes upright_boxes.block(0, 0, upright_boxes.rows(), 4) = - bbox_xyxy_to_ctrwh(clipped_upright_boxes_xyxy); + bbox_xyxy_to_ctrwh(clipped_upright_boxes_xyxy, legacy_plus_one); EArrXXt ret(boxes.rows(), boxes.cols()); ret = boxes; @@ -297,14 +304,16 @@ EArrXXt clip_boxes( const Eigen::ArrayBase& boxes, int height, int width, - float angle_thresh = 1.0) { + float angle_thresh = 1.0, + bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5); if (boxes.cols() == 4) { // Upright boxes - return clip_boxes_upright(boxes, height, width); + return clip_boxes_upright(boxes, height, width, legacy_plus_one); } else { // Rotated boxes with angle info - return clip_boxes_rotated(boxes, height, width, angle_thresh); + return clip_boxes_rotated( + boxes, height, width, angle_thresh, legacy_plus_one); } } @@ -316,7 +325,8 @@ template std::vector filter_boxes_upright( const Eigen::ArrayBase& boxes, double min_size, - const Eigen::Array3f& im_info) { + const Eigen::Array3f& im_info, + const bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(boxes.cols(), 4); // Scale min_size to match image scale @@ -325,8 +335,8 @@ std::vector filter_boxes_upright( using T = typename Derived::Scalar; using EArrX = EArrXt; - EArrX ws = boxes.col(2) - boxes.col(0) + T(1); - EArrX hs = boxes.col(3) - boxes.col(1) + T(1); + EArrX ws = boxes.col(2) - boxes.col(0) + T(int(legacy_plus_one)); + EArrX hs = boxes.col(3) - boxes.col(1) + T(int(legacy_plus_one)); EArrX x_ctr = boxes.col(0) + ws / T(2); EArrX y_ctr = boxes.col(1) + hs / T(2); @@ -368,11 +378,12 @@ template std::vector filter_boxes( const Eigen::ArrayBase& boxes, double min_size, - const Eigen::Array3f& im_info) { + const Eigen::Array3f& im_info, + const bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5); if (boxes.cols() == 4) { // Upright boxes - return filter_boxes_upright(boxes, min_size, im_info); + return filter_boxes_upright(boxes, min_size, im_info, legacy_plus_one); } else { // Rotated boxes with angle info return filter_boxes_rotated(boxes, min_size, im_info); diff --git a/caffe2/operators/generate_proposals_op_util_boxes_test.cc b/caffe2/operators/generate_proposals_op_util_boxes_test.cc index 1a08de7f422c..c36371c1ca14 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes_test.cc +++ 
b/caffe2/operators/generate_proposals_op_util_boxes_test.cc @@ -31,7 +31,8 @@ TEST(UtilsBoxesTest, TestBboxTransformRandom) { bbox.array(), deltas.array(), std::vector{1.0, 1.0, 1.0, 1.0}, - BBOX_XFORM_CLIP); + BBOX_XFORM_CLIP, + true /* legacy_plus_one */); EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-4); } @@ -64,6 +65,7 @@ TEST(UtilsBoxesTest, TestBboxTransformRotated) { deltas.array(), std::vector{1.0, 1.0, 1.0, 1.0}, BBOX_XFORM_CLIP, + true, /* legacy_plus_one */ false /* angle_bound_on */); EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-2); } @@ -96,6 +98,7 @@ TEST(UtilsBoxesTest, TestBboxTransformRotatedNormalized) { deltas.array(), std::vector{1.0, 1.0, 1.0, 1.0}, BBOX_XFORM_CLIP, + true, /* legacy_plus_one */ true, /* angle_bound_on */ -90, /* angle_bound_lo */ 90 /* angle_bound_hi */); @@ -117,7 +120,8 @@ TEST(UtilsBoxesTest, ClipRotatedBoxes) { // Test with no clipping float angle_thresh = -1.0; - auto result = utils::clip_boxes(bbox.array(), height, width, angle_thresh); + auto result = utils::clip_boxes( + bbox.array(), height, width, angle_thresh, true /* legacy_plus_one */); EXPECT_NEAR((result.matrix() - bbox).norm(), 0.0, 1e-4); EMatXf result_gt(5, 5); @@ -127,7 +131,8 @@ TEST(UtilsBoxesTest, ClipRotatedBoxes) { // Test clipping with tolerance angle_thresh = 1.0; - result = utils::clip_boxes(bbox.array(), height, width, angle_thresh); + result = utils::clip_boxes( + bbox.array(), height, width, angle_thresh, true /* legacy_plus_one */); EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-4); } diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 8c5234e3474c..571d7d59bcb8 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -27,7 +27,8 @@ std::vector nms_cpu_upright( const Eigen::ArrayBase& scores, const std::vector& sorted_indices, float thresh, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(proposals.rows(), scores.rows()); CAFFE_ENFORCE_EQ(proposals.cols(), 4); CAFFE_ENFORCE_EQ(scores.cols(), 1); @@ -40,7 +41,8 @@ std::vector nms_cpu_upright( auto x2 = proposals.col(2); auto y2 = proposals.col(3); - EArrX areas = (x2 - x1 + 1.0) * (y2 - y1 + 1.0); + EArrX areas = + (x2 - x1 + int(legacy_plus_one)) * (y2 - y1 + int(legacy_plus_one)); EArrXi order = AsEArrXt(sorted_indices); std::vector keep; @@ -59,8 +61,8 @@ std::vector nms_cpu_upright( EArrX xx2 = GetSubArray(x2, rest_indices).cwiseMin(x2[i]); EArrX yy2 = GetSubArray(y2, rest_indices).cwiseMin(y2[i]); - EArrX w = (xx2 - xx1 + 1.0).cwiseMax(0.0); - EArrX h = (yy2 - yy1 + 1.0).cwiseMax(0.0); + EArrX w = (xx2 - xx1 + int(legacy_plus_one)).cwiseMax(0.0); + EArrX h = (yy2 - yy1 + int(legacy_plus_one)).cwiseMax(0.0); EArrX inter = w * h; EArrX ovr = inter / (areas[i] + GetSubArray(areas, rest_indices) - inter); @@ -98,7 +100,8 @@ std::vector soft_nms_cpu_upright( float overlap_thresh = 0.3, float score_thresh = 0.001, unsigned int method = 1, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(proposals.rows(), scores.rows()); CAFFE_ENFORCE_EQ(proposals.cols(), 4); CAFFE_ENFORCE_EQ(scores.cols(), 1); @@ -110,7 +113,8 @@ std::vector soft_nms_cpu_upright( const auto& x2 = proposals.col(2); const auto& y2 = proposals.col(3); - EArrX areas = (x2 - x1 + 1.0) * (y2 - y1 + 1.0); + EArrX areas = + (x2 - x1 + int(legacy_plus_one)) * (y2 - y1 + int(legacy_plus_one)); // Initialize out_scores with 
original scores. Will be iteratively updated // as Soft-NMS is applied. @@ -138,8 +142,8 @@ std::vector soft_nms_cpu_upright( EArrX xx2 = GetSubArray(x2, rest_indices).cwiseMin(x2[i]); EArrX yy2 = GetSubArray(y2, rest_indices).cwiseMin(y2[i]); - EArrX w = (xx2 - xx1 + 1.0).cwiseMax(0.0); - EArrX h = (yy2 - yy1 + 1.0).cwiseMax(0.0); + EArrX w = (xx2 - xx1 + int(legacy_plus_one)).cwiseMax(0.0); + EArrX h = (yy2 - yy1 + int(legacy_plus_one)).cwiseMax(0.0); EArrX inter = w * h; EArrX ovr = inter / (areas[i] + GetSubArray(areas, rest_indices) - inter); @@ -656,11 +660,13 @@ std::vector nms_cpu( const Eigen::ArrayBase& scores, const std::vector& sorted_indices, float thresh, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE(proposals.cols() == 4 || proposals.cols() == 5); if (proposals.cols() == 4) { // Upright boxes - return nms_cpu_upright(proposals, scores, sorted_indices, thresh, topN); + return nms_cpu_upright( + proposals, scores, sorted_indices, thresh, topN, legacy_plus_one); } else { // Rotated boxes with angle info return nms_cpu_rotated(proposals, scores, sorted_indices, thresh, topN); @@ -681,7 +687,8 @@ template std::vector nms_cpu( const Eigen::ArrayBase& proposals, const Eigen::ArrayBase& scores, - float thres) { + float thres, + bool legacy_plus_one = false) { std::vector indices(proposals.rows()); std::iota(indices.begin(), indices.end(), 0); std::sort( @@ -689,7 +696,13 @@ std::vector nms_cpu( indices.data() + indices.size(), [&scores](int lhs, int rhs) { return scores(lhs) > scores(rhs); }); - return nms_cpu(proposals, scores, indices, thres); + return nms_cpu( + proposals, + scores, + indices, + thres, + -1 /* topN */, + legacy_plus_one /* legacy_plus_one */); } template @@ -702,7 +715,8 @@ std::vector soft_nms_cpu( float overlap_thresh = 0.3, float score_thresh = 0.001, unsigned int method = 1, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE(proposals.cols() == 4 || proposals.cols() == 5); if (proposals.cols() == 4) { // Upright boxes @@ -715,7 +729,8 @@ std::vector soft_nms_cpu( overlap_thresh, score_thresh, method, - topN); + topN, + legacy_plus_one); } else { // Rotated boxes with angle info return soft_nms_cpu_rotated( @@ -740,7 +755,8 @@ std::vector soft_nms_cpu( float overlap_thresh = 0.3, float score_thresh = 0.001, unsigned int method = 1, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { std::vector indices(proposals.rows()); std::iota(indices.begin(), indices.end(), 0); return soft_nms_cpu( @@ -752,7 +768,8 @@ std::vector soft_nms_cpu( overlap_thresh, score_thresh, method, - topN); + topN, + legacy_plus_one); } } // namespace utils diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu index 0cf157c99022..60cd996e45e3 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu @@ -29,6 +29,7 @@ __launch_bounds__( const Box* d_desc_sorted_boxes, const int nboxes, const float thresh, + const bool legacy_plus_one, const int mask_ld, int* d_delete_mask) { // Storing boxes used by this CUDA block in the shared memory @@ -45,7 +46,8 @@ __launch_bounds__( if (threadIdx.y == 0) { const Box box = d_desc_sorted_boxes[i_to_load]; shared_i_areas[threadIdx.x] = - (box.x2 - box.x1 + 1.0f) * (box.y2 - box.y1 + 1.0f); + (box.x2 - box.x1 + float(int(legacy_plus_one))) * + (box.y2 - box.y1 + float(int(legacy_plus_one))); shared_i_boxes[threadIdx.x] = box; } 
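// ----------------------------------------------------------------------------
// Sketch (editor's illustration, not part of the patch): the CPU
// nms_cpu_upright / soft_nms_cpu_upright paths and this GPU kernel now share
// the same legacy_plus_one-aware IoU. A standalone helper showing that formula
// (Box4 and iou_upright_sketch are invented names; fminf/fmaxf/fdimf come from
// <cmath> on the host and are CUDA math builtins on the device):

struct Box4 {
  float x1, y1, x2, y2;
};

inline float iou_upright_sketch(const Box4& a, const Box4& b, bool legacy_plus_one) {
  const float lp = float(int(legacy_plus_one));
  const float area_a = (a.x2 - a.x1 + lp) * (a.y2 - a.y1 + lp);
  const float area_b = (b.x2 - b.x1 + lp) * (b.y2 - b.y1 + lp);
  // Intersection; fdimf(x, y) == max(x - y, 0), as in the kernel above.
  const float w = fdimf(fminf(a.x2, b.x2) + lp, fmaxf(a.x1, b.x1));
  const float h = fdimf(fminf(a.y2, b.y2) + lp, fmaxf(a.y1, b.y1));
  const float inter = w * h;
  return inter / (area_a + area_b - inter);
}
// ----------------------------------------------------------------------------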
} @@ -68,7 +70,8 @@ __launch_bounds__( const Box j_box = d_desc_sorted_boxes[j]; const Box i_box = shared_i_boxes[threadIdx.x]; const float j_area = - (j_box.x2 - j_box.x1 + 1.0f) * (j_box.y2 - j_box.y1 + 1.0f); + (j_box.x2 - j_box.x1 + float(int(legacy_plus_one))) * + (j_box.y2 - j_box.y1 + float(int(legacy_plus_one))); const float i_area = shared_i_areas[threadIdx.x]; // The following code will not be valid with empty boxes if (i_area == 0.0f || j_area == 0.0f) @@ -79,8 +82,8 @@ __launch_bounds__( const float yy2 = fminf(i_box.y2, j_box.y2); // fdimf computes the positive difference between xx2+1 and xx1 - const float w = fdimf(xx2 + 1.0f, xx1); - const float h = fdimf(yy2 + 1.0f, yy1); + const float w = fdimf(xx2 + float(int(legacy_plus_one)), xx1); + const float h = fdimf(yy2 + float(int(legacy_plus_one)), yy1); const float intersection = w * h; // Testing for a/b > t @@ -109,6 +112,7 @@ void nms_gpu_upright( const float* d_desc_sorted_boxes_float_ptr, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, @@ -132,7 +136,7 @@ void nms_gpu_upright( CAFFE_CUDA_NUM_THREADS_2D, 0, context->cuda_stream()>>>( - d_desc_sorted_boxes, N, thresh, mask_ld, d_delete_mask); + d_desc_sorted_boxes, N, thresh, legacy_plus_one, mask_ld, d_delete_mask); host_delete_mask.Resize(N * mask_ld); int* h_delete_mask = host_delete_mask.template mutable_data(); @@ -554,6 +558,7 @@ void nms_gpu( const float* d_desc_sorted_boxes, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, @@ -566,6 +571,7 @@ void nms_gpu( d_desc_sorted_boxes, N, thresh, + legacy_plus_one, d_keep_sorted_list, h_nkeep, dev_delete_mask, diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.h b/caffe2/operators/generate_proposals_op_util_nms_gpu.h index da7a8401ed12..10d081f1f38e 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu.h +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.h @@ -27,6 +27,7 @@ CAFFE2_API void nms_gpu_upright( const float* d_desc_sorted_boxes, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, @@ -55,6 +56,7 @@ CAFFE2_API void nms_gpu( const float* d_desc_sorted_boxes, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc b/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc index 372accae0af4..cd1428de0682 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc @@ -76,6 +76,7 @@ TEST(UtilsNMSTest, TestNMSGPU) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -207,7 +208,13 @@ TEST(UtilsNMSTest, TestPerfNMS) { // Running ntests runs of CPU NMS auto cpu_start = std::chrono::steady_clock::now(); for (int itest = 0; itest < ntests; ++itest) { - utils::nms_cpu(proposals, scores, indices, thresh); + utils::nms_cpu( + proposals, + scores, + indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); } auto cpu_stop = std::chrono::steady_clock::now(); @@ -244,6 +251,7 @@ TEST(UtilsNMSTest, TestPerfNMS) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -342,13 +350,19 @@ TEST(UtilsNMSTest, 
GPUEqualsCPUCorrectnessTest) { // Running ntests runs of CPU NMS for (int itest = 0; itest < ntests; ++itest) { - std::vector keep = - utils::nms_cpu(eig_proposals, eig_scores, sorted_indices, thresh); + std::vector keep = utils::nms_cpu( + eig_proposals, + eig_scores, + sorted_indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); int list_nitems; utils::nms_gpu( d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -439,6 +453,7 @@ TEST(UtilsNMSTest, TestNMSGPURotatedAngle0) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -507,7 +522,13 @@ TEST(UtilsNMSTest, TestPerfRotatedNMS) { // Running ntests runs of CPU NMS auto cpu_start = std::chrono::steady_clock::now(); for (int itest = 0; itest < ntests; ++itest) { - utils::nms_cpu(proposals, scores, indices, thresh); + utils::nms_cpu( + proposals, + scores, + indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); } auto cpu_stop = std::chrono::steady_clock::now(); @@ -544,6 +565,7 @@ TEST(UtilsNMSTest, TestPerfRotatedNMS) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -642,13 +664,19 @@ TEST(UtilsNMSTest, GPUEqualsCPURotatedCorrectnessTest) { // Running ntests runs of CPU NMS for (int itest = 0; itest < ntests; ++itest) { - std::vector keep = - utils::nms_cpu(eig_proposals, eig_scores, sorted_indices, thresh); + std::vector keep = utils::nms_cpu( + eig_proposals, + eig_scores, + sorted_indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); int list_nitems; utils::nms_gpu( d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, diff --git a/caffe2/operators/generate_proposals_op_util_nms_test.cc b/caffe2/operators/generate_proposals_op_util_nms_test.cc index 8e8b5f17afab..2d168446bc39 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_test.cc +++ b/caffe2/operators/generate_proposals_op_util_nms_test.cc @@ -19,7 +19,8 @@ TEST(UtilsNMSTest, TestNMS) { auto proposals = input.block(0, 0, input.rows(), 4); auto scores = input.col(4); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, scores, input_thresh[i], true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -31,7 +32,13 @@ TEST(UtilsNMSTest, TestNMS) { indices.data() + indices.size(), [&scores](int lhs, int rhs) { return scores(lhs) > scores(rhs); }); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, indices, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -39,8 +46,13 @@ TEST(UtilsNMSTest, TestNMS) { std::vector top_n = {1, 1, 2, 2, 3}; auto gt_out = output_gt; for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = - utils::nms_cpu(proposals, scores, indices, input_thresh[i], top_n[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + top_n[i], + true /* legacy_plus_one */); gt_out[i].resize(top_n[i]); EXPECT_EQ(gt_out[i], cur_out); } @@ -92,7 +104,8 @@ TEST(UtilsNMSTest, TestNMS1) { 18, 19, 21, 23, 24, 25, 26, 30, 32, 33, 34, 35, 37, 43, 44, 47, 50}; - auto cur_out = utils::nms_cpu(proposals, scores, 0.5); + auto cur_out = + utils::nms_cpu(proposals, scores, 0.5, true /* 
legacy_plus_one */); std::sort(cur_out.begin(), cur_out.end()); EXPECT_EQ(output_gt, cur_out); } @@ -148,7 +161,9 @@ TEST(UtilsNMSTest, TestSoftNMS) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(keep, keep_gt); { auto diff = expected_scores - out_scores; @@ -165,7 +180,8 @@ TEST(UtilsNMSTest, TestSoftNMS) { overlap_thresh[i], 0.0001, method[i], - topN); + topN, + true /* legacy_plus_one */); std::vector expected_keep(keep_gt.begin(), keep_gt.begin() + topN); EXPECT_EQ(expected_keep, keep); } @@ -180,7 +196,9 @@ TEST(UtilsNMSTest, TestSoftNMS) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); std::sort(keep.begin(), keep.end()); EXPECT_EQ(indices, keep); { @@ -198,7 +216,9 @@ TEST(UtilsNMSTest, TestSoftNMS) { 0.5, overlap_thresh[i], score_thresh, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); { auto expected_keep = utils::GetArrayIndices(expected_scores >= score_thresh); @@ -235,7 +255,8 @@ TEST(UtilsNMSTest, TestNMSRotatedAngle0) { auto scores = input.col(4); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, scores, input_thresh[i], true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -247,7 +268,13 @@ TEST(UtilsNMSTest, TestNMSRotatedAngle0) { indices.data() + indices.size(), [&scores](int lhs, int rhs) { return scores(lhs) > scores(rhs); }); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, indices, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -255,8 +282,13 @@ TEST(UtilsNMSTest, TestNMSRotatedAngle0) { std::vector top_n = {1, 1, 2, 2, 3}; auto gt_out = output_gt; for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = - utils::nms_cpu(proposals, scores, indices, input_thresh[i], top_n[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + top_n[i], + true /* legacy_plus_one */); gt_out[i].resize(top_n[i]); EXPECT_EQ(gt_out[i], cur_out); } @@ -322,7 +354,9 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(keep, keep_gt); { auto diff = expected_scores - out_scores; @@ -339,7 +373,8 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { overlap_thresh[i], 0.0001, method[i], - topN); + topN, + true /* legacy_plus_one */); std::vector expected_keep(keep_gt.begin(), keep_gt.begin() + topN); EXPECT_EQ(expected_keep, keep); } @@ -354,7 +389,9 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); std::sort(keep.begin(), keep.end()); EXPECT_EQ(indices, keep); { @@ -372,7 +409,9 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { 0.5, overlap_thresh[i], score_thresh, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); { auto expected_keep = utils::GetArrayIndices(expected_scores >= score_thresh); diff --git a/caffe2/operators/given_tensor_fill_op.cc b/caffe2/operators/given_tensor_fill_op.cc index 6cf039c3d75b..bab8acca8322 100644 --- a/caffe2/operators/given_tensor_fill_op.cc +++ b/caffe2/operators/given_tensor_fill_op.cc @@ -7,6 +7,9 @@ 
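// Note (editor's sketch, not part of the patch): the hunks that follow add an
// int16 flavor of GivenTensorFill end to end -- CPU and CUDA registrations, a
// GivenTensorInt16Fill schema, and a new case in the dtype switch of
// GivenTensorFillOp. The new switch case, with the template argument written
// out as an assumption (int16_t, matching TensorProto_DataType_INT16):
//
//   case TensorProto_DataType_INT16:
//     ExtractValues<int16_t>();
//     break;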
REGISTER_CPU_OPERATOR( GivenTensorDoubleFill, GivenTensorFillOp); REGISTER_CPU_OPERATOR(GivenTensorBoolFill, GivenTensorFillOp); +REGISTER_CPU_OPERATOR( + GivenTensorInt16Fill, + GivenTensorFillOp); REGISTER_CPU_OPERATOR(GivenTensorIntFill, GivenTensorFillOp); REGISTER_CPU_OPERATOR( GivenTensorInt64Fill, @@ -18,6 +21,7 @@ REGISTER_CPU_OPERATOR( NO_GRADIENT(GivenTensorFill); NO_GRADIENT(GivenTensorDoubleFill); NO_GRADIENT(GivenTensorBoolFill); +NO_GRADIENT(GivenTensorInt16Fill); NO_GRADIENT(GivenTensorIntFill); NO_GRADIENT(GivenTensorInt64Fill); NO_GRADIENT(GivenTensorStringFill); @@ -141,6 +145,28 @@ OPERATOR_SCHEMA(GivenTensorBoolFill) "1D tensor containing the desired output shape. First input must be in CPU context.") .TensorInferenceFunction(FillerTensorInference); +OPERATOR_SCHEMA(GivenTensorInt16Fill) + .NumInputs(0, 1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .Arg( + "values", + "The value for the elements of the output tensor.", + true /* required */) + .Arg( + "shape", + "The shape of the output tensor." + "Cannot set the shape argument and pass in an input at the same time.") + .Arg( + "extra_shape", + "The additional dimensions appended at the end of the shape indicated" + "by the input blob." + "Cannot set the extra_shape argument when there is no input blob.") + .Arg( + "input_as_shape", + "1D tensor containing the desired output shape. First input must be in CPU context.") + .TensorInferenceFunction(FillerTensorInference); + OPERATOR_SCHEMA(GivenTensorIntFill) .NumInputs(0, 1) .NumOutputs(1) diff --git a/caffe2/operators/given_tensor_fill_op.cu b/caffe2/operators/given_tensor_fill_op.cu index af0b8863fb2c..706c95c6277f 100644 --- a/caffe2/operators/given_tensor_fill_op.cu +++ b/caffe2/operators/given_tensor_fill_op.cu @@ -7,6 +7,9 @@ REGISTER_CUDA_OPERATOR(GivenTensorFill, GivenTensorFillOp); REGISTER_CUDA_OPERATOR( GivenTensorDoubleFill, GivenTensorFillOp); +REGISTER_CUDA_OPERATOR( + GivenTensorInt16Fill, + GivenTensorFillOp); REGISTER_CUDA_OPERATOR(GivenTensorIntFill, GivenTensorFillOp); REGISTER_CUDA_OPERATOR( GivenTensorInt64Fill, diff --git a/caffe2/operators/given_tensor_fill_op.h b/caffe2/operators/given_tensor_fill_op.h index 1ba9f08f5c58..202958685e83 100644 --- a/caffe2/operators/given_tensor_fill_op.h +++ b/caffe2/operators/given_tensor_fill_op.h @@ -34,6 +34,9 @@ class GivenTensorFillOp final : public FillerOp { case TensorProto_DataType_BOOL: ExtractValues(); break; + case TensorProto_DataType_INT16: + ExtractValues(); + break; case TensorProto_DataType_INT32: ExtractValues(); break; diff --git a/caffe2/operators/instance_norm_op.cc b/caffe2/operators/instance_norm_op.cc index 5730cef37955..abd19a03cef3 100644 --- a/caffe2/operators/instance_norm_op.cc +++ b/caffe2/operators/instance_norm_op.cc @@ -15,20 +15,31 @@ bool InstanceNormOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE( !IsInputOutputAlias(INPUT, OUTPUT), "Can't run InstanceNorm NHWC in-place"); - auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_; - auto* inv_stdev = OutputSize() > 1 ? 
Output(INV_STDEV) : &inv_stdev_; + const int N = X.dim32(0); const int H = X.dim32(1); const int W = X.dim32(2); const int C = X.dim32(3); const size_t offset = H * W * C; - CAFFE_ENFORCE_EQ(Input(SCALE).numel(), C); CAFFE_ENFORCE_EQ(Input(BIAS).numel(), C); auto* Y = Output(OUTPUT, X.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(Context::GetDeviceType())); + mean = &mean_; + } + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(Context::GetDeviceType())); + inv_stdev = &inv_stdev_; + } + ConstEigenVectorArrayMap scale(Input(SCALE).template data(), C); ConstEigenVectorArrayMap bias(Input(BIAS).template data(), C); for (int n = 0; n < N; ++n) { @@ -66,19 +77,29 @@ bool InstanceNormOp::RunOnDeviceWithOrderNCHW() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); - auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_; - auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_; const int N = X.dim32(0); const int C = X.dim32(1); const int H = X.dim32(2); const int W = X.dim32(3); - CAFFE_ENFORCE_EQ(scale.numel(), C); CAFFE_ENFORCE_EQ(bias.numel(), C); auto* Y = Output(OUTPUT, X.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(Context::GetDeviceType())); + mean = &mean_; + } + + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(Context::GetDeviceType())); + inv_stdev = &inv_stdev_; + } const auto* Xdata = X.template data(); auto* Ydata = Y->template mutable_data(); diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu index 31ada0c23f59..66d73597f745 100644 --- a/caffe2/operators/instance_norm_op.cu +++ b/caffe2/operators/instance_norm_op.cu @@ -188,8 +188,6 @@ bool InstanceNormOp::RunOnDeviceWithOrderNHWC() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); - auto mean = OutputSize() >= 2 ? Output(MEAN) : &mean_; - auto inv_stdev = OutputSize() >= 3 ? 
Output(INV_STDEV) : &inv_stdev_; CAFFE_ENFORCE_EQ(4, input.dim()); const int N = input.dim32(0); const int H = input.dim32(1); @@ -200,8 +198,22 @@ bool InstanceNormOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(1, bias.dim()); CAFFE_ENFORCE_EQ(C, bias.dim32(0)); auto output = Output(OUTPUT, input.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(CUDA)); + mean = &mean_; + } + + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(CUDA)); + inv_stdev = &inv_stdev_; + } const auto input_data = input.data(); const auto scale_data = scale.data(); @@ -265,8 +277,6 @@ bool InstanceNormOp::RunOnDeviceWithOrderNCHW() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); - auto mean = OutputSize() >= 2 ? Output(MEAN) : &mean_; - auto inv_stdev = OutputSize() >= 3 ? Output(INV_STDEV) : &inv_stdev_; CAFFE_ENFORCE_EQ(4, input.dim()); const int N = input.dim32(0); const int C = input.dim32(1); @@ -277,8 +287,22 @@ bool InstanceNormOp::RunOnDeviceWithOrderNCHW() { CAFFE_ENFORCE_EQ(1, bias.dim()); CAFFE_ENFORCE_EQ(C, bias.dim32(0)); auto output = Output(OUTPUT, input.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(CUDA)); + mean = &mean_; + } + + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(CUDA)); + inv_stdev = &inv_stdev_; + } const auto input_data = input.data(); const auto scale_data = scale.data(); @@ -471,9 +495,9 @@ bool InstanceNormGradientOp::RunOnDeviceWithOrderNCHW() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); const auto& output_grad = Input(OUTPUT_GRAD); + const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_; const auto& inv_stdev = InputSize() >= 6 ? 
Input(INV_STDEV) : inv_stdev_; - CAFFE_ENFORCE_EQ(4, input.dim()); const int N = input.dim32(0); const int C = input.dim32(1); @@ -507,7 +531,7 @@ bool InstanceNormGradientOp::RunOnDeviceWithOrderNCHW() { const auto dim_stride = 1; if (InputSize() < 5) { - mean_.Resize(N, C); + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(CUDA)); auto mean_mutable_data = mean_.mutable_data(); InstanceNormMeanKernel<<< CAFFE_GET_BLOCKS(N * C), @@ -530,7 +554,7 @@ bool InstanceNormGradientOp::RunOnDeviceWithOrderNCHW() { const auto mean_data = mean.data(); if (InputSize() < 6) { - inv_stdev_.Resize(N, C); + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(CUDA)); auto inv_stdev_mutable_data = inv_stdev_.mutable_data(); InstanceNormInvStdevKernel<<< CAFFE_GET_BLOCKS(N * C), diff --git a/caffe2/operators/instance_norm_op.h b/caffe2/operators/instance_norm_op.h index cbd6bd44bf44..3a665ff891c7 100644 --- a/caffe2/operators/instance_norm_op.h +++ b/caffe2/operators/instance_norm_op.h @@ -41,8 +41,8 @@ class InstanceNormOp : public Operator { StorageOrder order_; // temp results that get passed to the gradient, but are otherwise stored here - Tensor mean_{Context::GetDeviceType()}; - Tensor inv_stdev_{Context::GetDeviceType()}; + Tensor mean_; + Tensor inv_stdev_; INPUT_TAGS(INPUT, SCALE, BIAS); OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 78bf949902ea..b7666cb33645 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -95,22 +95,49 @@ void SetOutputTensorDescriptorTypeAndBuffer( desc->scales = &cpu_int8tensor->scale; desc->biases = &cpu_int8tensor->zero_point; } + +#ifndef C10_MOBILE +void CopyDescriptor( + const ExternalTensorDescriptor* from, + onnxTensorDescriptorV1* to) { + to->dataType = from->dataType; + to->buffer = from->buffer; + to->quantizationParams = from->quantizationParams; + to->quantizationAxis = from->quantizationAxis; + to->scales = from->scales; + to->biases = from->biases; + to->dimensions = from->dimensions; + to->shape = from->shape; +} +#endif + void BlobToTensorDescriptor( const std::string& name, Workspace* ws, onnxTensorDescriptorV1* desc, - std::vector>* shapes) { + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets) { const Blob* blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob ", name, " doesn't exist"); const bool is_int8tensor = blob->meta().id() == TypeMeta::Id(); + bool is_external_tensor; +#ifndef C10_MOBILE + auto function_ptr = + ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id()); + is_external_tensor = function_ptr != nullptr; +#else + is_external_tensor = false; +#endif // Memory type // We only allow weights to be CPU tensor or int8tensor for now CAFFE_ENFORCE( - (BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob)), + (BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob) || + is_external_tensor), "Initialization blob ", name, - " needs to be TensorCPU or Int8TensorCPU"); + " needs to be TensorCPU or Int8TensorCPU or Int8FCDNNLowPPackedWeightBlob Based class"); desc->tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1; desc->memoryType = ONNXIFI_MEMORY_TYPE_CPU; @@ -124,6 +151,13 @@ void BlobToTensorDescriptor( desc->dimensions = shape.size(); shapes->emplace_back(shape.cbegin(), shape.cend()); desc->shape = shapes->back().data(); + } else if (is_external_tensor) { +#ifndef C10_MOBILE + ExternalTensorDescriptor ext_desc; + function_ptr->SetupExternalTensorDescriptor( + blob, shapes, all_scales, 
all_offsets, &ext_desc); + CopyDescriptor(&ext_desc, desc); +#endif } else { // Data type const auto& cpu_tensor = blob->template Get(); @@ -159,7 +193,8 @@ OnnxifiOp::buildInitializationList( weight_names->emplace_back(s); onnxTensorDescriptorV1 tensor_desc; tensor_desc.name = weight_names->back().c_str(); - BlobToTensorDescriptor(s, ws, &tensor_desc, weight_shapes); + BlobToTensorDescriptor( + s, ws, &tensor_desc, weight_shapes, &all_scales_, &all_offsets_); descs.push_back(tensor_desc); initialization_list.erase(it); } diff --git a/caffe2/operators/onnxifi_op.h b/caffe2/operators/onnxifi_op.h index 657f2cb5c180..4d5d71da5c5f 100644 --- a/caffe2/operators/onnxifi_op.h +++ b/caffe2/operators/onnxifi_op.h @@ -304,6 +304,10 @@ class OnnxifiOp final : public Operator { // dim but uint64_t for onnxDesciptor dim. Maybe we should just use int64_t c10::SmallVector tensor_dims_int64_; + // This is for multi group quantization info + std::vector> all_scales_; + std::vector> all_offsets_; + // output shape hints std::unordered_map output_shape_hints_; diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc index 254f917a5c97..34d19c2d0c00 100644 --- a/caffe2/operators/segment_reduction_op.cc +++ b/caffe2/operators/segment_reduction_op.cc @@ -579,3 +579,36 @@ REGISTER_LENGTHS_OPS_MAIN_INPUT_AND_FORWARD_OUTPUT_GRADIENT( LengthsMaxWithMainInputAndForwardOutputGradient, AbstractLengthsDef); } // namespace caffe2 + +// Macro doesn't like comma +using LengthsSumCPUOp = caffe2::AbstractLengthsDef< + float, + int, + caffe2::CPUContext, + caffe2::SumReducerDef, + true>::ForwardOp; +using LengthsMeanCPUOp = caffe2::AbstractLengthsDef< + float, + int, + caffe2::CPUContext, + caffe2::MeanReducerDef, + true>::ForwardOp; +using LengthsMaxCPUOp = caffe2::AbstractLengthsDef< + float, + int, + caffe2::CPUContext, + caffe2::MaxReducerDef, + true>::ForwardOp; + +C10_REGISTER_CAFFE2_OPERATOR_CPU( + LengthsSum, + "_caffe2::LengthsSum(Tensor data, Tensor lengths) -> Tensor", + LengthsSumCPUOp); +C10_REGISTER_CAFFE2_OPERATOR_CPU( + LengthsMean, + "_caffe2::LengthsMean(Tensor data, Tensor lengths) -> Tensor", + LengthsMeanCPUOp); +C10_REGISTER_CAFFE2_OPERATOR_CPU( + LengthsMax, + "_caffe2::LengthsMax(Tensor data, Tensor lengths) -> Tensor", + LengthsMaxCPUOp); diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index a2678a67ac95..24d4d84e65a3 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -1,11 +1,16 @@ #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/operators/reducer_functors.h" +C10_DECLARE_CAFFE2_OPERATOR(LengthsSum); +C10_DECLARE_CAFFE2_OPERATOR(LengthsMean); +C10_DECLARE_CAFFE2_OPERATOR(LengthsMax); + namespace caffe2 { template diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu index 5e1099ef9dc1..fc4bc5a21c83 100644 --- a/caffe2/operators/segment_reduction_op_gpu.cu +++ b/caffe2/operators/segment_reduction_op_gpu.cu @@ -3,9 +3,9 @@ #include #include "caffe2/core/context_gpu.h" #include "caffe2/core/operator.h" +#include "caffe2/operators/segment_reduction_op.h" #include "caffe2/utils/math.h" - namespace caffe2 { namespace { @@ -411,8 +411,10 @@ template class CUDASparseLengthsSumOp : public Operator { public: 
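Aside (not part of the patch): the C10_REGISTER_CAFFE2_OPERATOR_CPU calls above expose LengthsSum/LengthsMean/LengthsMax through schemas of the form "_caffe2::LengthsSum(Tensor data, Tensor lengths) -> Tensor". As a reading aid, here is a minimal NumPy sketch of the lengths-based segment reduction those operators implement; the CUDA operator definitions continue below, and the helper name and sample values here are illustrative only, not part of this diff.

```python
# Reference sketch (plain NumPy, not the registered C++ operators):
# given `data` of shape (N, ...) and `lengths` summing to N, output row r
# reduces the next lengths[r] rows of `data`.
import numpy as np

def lengths_reduce(data, lengths, reducer=np.sum):
    assert lengths.sum() == data.shape[0], "lengths must partition data's first dim"
    out, offset = [], 0
    for n in lengths:
        out.append(reducer(data[offset:offset + n], axis=0))
        offset += n
    return np.stack(out)

data = np.arange(12, dtype=np.float32).reshape(6, 2)
lengths = np.array([2, 3, 1], dtype=np.int32)
print(lengths_reduce(data, lengths, np.sum))   # LengthsSum-like:  [[2, 4], [18, 21], [10, 11]]
print(lengths_reduce(data, lengths, np.mean))  # LengthsMean-like
print(lengths_reduce(data, lengths, np.max))   # LengthsMax-like
```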
USE_OPERATOR_CONTEXT_FUNCTIONS; - CUDASparseLengthsSumOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + + template + explicit CUDASparseLengthsSumOp(Args&&... args) + : Operator(std::forward(args)...) {} ~CUDASparseLengthsSumOp() {} @@ -531,8 +533,10 @@ template class CUDASparseLengthsMeanOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - CUDASparseLengthsMeanOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + + template + explicit CUDASparseLengthsMeanOp(Args&&... args) + : Operator(std::forward(args)...) {} ~CUDASparseLengthsMeanOp() {} @@ -652,8 +656,10 @@ template class CUDASparseLengthsMaxOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - CUDASparseLengthsMaxOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + + template + explicit CUDASparseLengthsMaxOp(Args&&... args) + : Operator(std::forward(args)...) {} ~CUDASparseLengthsMaxOp() {} @@ -966,7 +972,10 @@ class CUDAUnsortedSegmentSumOp : public Operator { context_.cuda_stream()); // the second call do the real computation. - ReinitializeTensor(&buffer_tensor_, {static_cast(tmp_storage_bytes)}, at::dtype().device(CUDA)); + ReinitializeTensor( + &buffer_tensor_, + {static_cast(tmp_storage_bytes)}, + at::dtype().device(CUDA)); cub::DeviceReduce::Max( static_cast(buffer_tensor_.mutable_data()), tmp_storage_bytes, @@ -996,46 +1005,47 @@ class CUDAUnsortedSegmentSumOp : public Operator { output->numel(), T(0), output->template mutable_data(), &context_); if (!mean) { - UnsortedSegmentSumKernel<<< - CAFFE_GET_BLOCKS(data.numel()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - data.numel(), - slize_sz, - segment_ids.template data(), - data.template data(), - output->template mutable_data(), - nullptr); + UnsortedSegmentSumKernel + <<>>( + data.numel(), + slize_sz, + segment_ids.template data(), + data.template data(), + output->template mutable_data(), + nullptr); } else { // For mean, we need to compute scaling factors - ReinitializeTensor(&scaling_factors_, {K + 1}, at::dtype().device(CUDA)); + ReinitializeTensor( + &scaling_factors_, {K + 1}, at::dtype().device(CUDA)); math::Set( scaling_factors_.numel(), int(0), scaling_factors_.template mutable_data(), &context_); - UnsortedSegmentSumKernel<<< - CAFFE_GET_BLOCKS(data.numel()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - data.numel(), - slize_sz, - segment_ids.template data(), - data.template data(), - output->template mutable_data(), - scaling_factors_.template mutable_data()); + UnsortedSegmentSumKernel + <<>>( + data.numel(), + slize_sz, + segment_ids.template data(), + data.template data(), + output->template mutable_data(), + scaling_factors_.template mutable_data()); // Divide by the scaling factors to get means - SegmentScalingKernel<<< - CAFFE_GET_BLOCKS(output->numel()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - output->numel(), - slize_sz, - scaling_factors_.template data(), - output->template mutable_data()); + SegmentScalingKernel + <<numel()), + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + output->numel(), + slize_sz, + scaling_factors_.template data(), + output->template mutable_data()); } return true; } @@ -1821,3 +1831,15 @@ REGISTER_CUDA_OPERATOR( LengthsIndicesInGradientMeanGradient, CUDASparseLengthsMeanGradientWithIndicesOp); } // namespace caffe2 + +// Macro doesn't like comma +using LengthsSumCUDAOp = + caffe2::CUDASparseLengthsSumOp; +using LengthsMeanCUDAOp = + 
caffe2::CUDASparseLengthsMeanOp; +using LengthsMaxCUDAOp = + caffe2::CUDASparseLengthsMaxOp; + +C10_REGISTER_CAFFE2_OPERATOR_CUDA(LengthsSum, LengthsSumCUDAOp); +C10_REGISTER_CAFFE2_OPERATOR_CUDA(LengthsMean, LengthsMeanCUDAOp); +C10_REGISTER_CAFFE2_OPERATOR_CUDA(LengthsMax, LengthsMaxCUDAOp); diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 6867d87c85f4..62b27c603cfc 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -66,11 +66,10 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { template explicit SparseToDenseMaskOp(Args&&... args) : SparseToDenseMaskBase(std::forward(args)...) { - returnPresenceMask_ = this->template GetSingleArgument( - "return_presence_mask", false); - maxSkippedSparseIndices_ = - this->template GetSingleArgument( - "max_skipped_indices", kMaxSkippedSparseIndices); + returnPresenceMask_ = + this->template GetSingleArgument("return_presence_mask", false); + maxSkippedRows_ = this->template GetSingleArgument( + "max_skipped_indices", kMaxSkippedSparseIndices); } bool RunOnDevice() override { @@ -151,15 +150,13 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { int64_t offset = 0; for (int r = 0; r < rows; r++) { + bool skippedSparseIndex = false; for (int c = 0; c < lengths_vec[r]; c++) { const auto sparse_index = sparse_indices_vec[offset + c]; if (sparse_index < 0 || sparse_index >= std::numeric_limits::max()) { + skippedSparseIndex = true; LOG(WARNING) << "Skipping invalid sparse index: " << sparse_index; - CAFFE_ENFORCE_LT( - ++skippedSparseIndices_, - maxSkippedSparseIndices_, - "Too many sparse indices skipped"); continue; } int idx = this->getFeatureIdx(sparse_index); @@ -174,6 +171,11 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { } } } + skippedRows_ += skippedSparseIndex; + CAFFE_ENFORCE_LT( + skippedRows_, + maxSkippedRows_, + "Too many rows with invalid sparse indices skipped"); offset += lengths_vec[r]; } @@ -181,11 +183,11 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { } private: - static const uint32_t kMaxSkippedSparseIndices = 5; + static const uint32_t kMaxSkippedSparseIndices = 50; bool returnPresenceMask_; - uint32_t maxSkippedSparseIndices_ = 0; - uint32_t skippedSparseIndices_ = 0; + uint32_t maxSkippedRows_ = 0; + uint32_t skippedRows_ = 0; INPUT_TAGS(INDICES, VALUES, DEFAULT, LENGTHS); OUTPUT_TAGS(OUTPUTVALUE, PRESENCEMASK); diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 0edc47efea31..10cec6c9ccb3 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -51,6 +51,7 @@ REGISTER_CPU_OPERATOR( ScatterWeightedSum, ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); +REGISTER_CPU_OPERATOR(Scatter, ScatterOp); REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); @@ -369,6 +370,38 @@ Currently only works on CPU because of access to INDICES. "Update slices, with shape len(INDICES) + shape(X_0)[1:]") .Output(0, "DATA", "Has to be exactly the same tensor as the input 0"); +OPERATOR_SCHEMA(Scatter) + .NumInputs(3) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .SetDoc(R"DOC( +Update values of the tensor by overriding current value specified by indices. + +Writes all values from the tensor UPDATES into DATA at the indices specified in the INDICES tensor. 
+For each value in DATA, its output index is specified by its index in UPDATES and by the corresponding value in INDICES for the specified axis.
+
+For a 3-D tensor, DATA is updated as:
+
+DATA[INDICES[i][j][k]][j][k] = UPDATES[i][j][k]  # if axis == 0
+DATA[i][INDICES[i][j][k]][k] = UPDATES[i][j][k]  # if axis == 1
+DATA[i][j][INDICES[i][j][k]] = UPDATES[i][j][k]  # if axis == 2
+
+Currently only works on CPU because of access to INDICES.
+)DOC")
+    .Input(0, "DATA", "Tensor to be updated.")
+    .Input(
+        1,
+        "INDICES",
+        "1-D list of indices on the first dimension"
+        "of X_0 that need to be updated")
+    .Input(
+        2,
+        "UPDATES",
+        "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
+    .Output(0, "OUTPUT", "The updated output.")
+    .Arg(
+        "axis",
+        "*(type: int; default: 1)* Which dimension to scatter on.");
 OPERATOR_SCHEMA(HasElements)
     .NumInputs(1)
@@ -739,6 +772,7 @@ REGISTER_GRADIENT(Sum, GetSumGradient);
 SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
 SHOULD_NOT_DO_GRADIENT(ScatterAssign);
+SHOULD_NOT_DO_GRADIENT(Scatter);
 class GetWeightedSumGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index 2b38d1b67f96..8dd1c45cc461 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -738,6 +738,106 @@ class ScatterAssignOp : public Operator {
   INPUT_TAGS(DATA, INDICES, SLICES);
 };
+template
+class ScatterOp : public Operator {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  template
+  explicit ScatterOp(Args&&... args)
+      : Operator(std::forward(args)...),
+        OP_SINGLE_ARG(int, "axis", axis_, 1) {
+  }
+
+  virtual ~ScatterOp() noexcept override {}
+
+  bool RunOnDevice() override {
+
+    TORCH_CHECK(Context::GetDeviceType() == kCPU, "ScatterOp currently only supports CPU.")
+
+    return DispatchHelper>::call(
+        this, this->template Input(INDICES, CPU));
+  }
+
+  template
+  bool DoRunWithType() {
+    const Tensor& data = Input(DATA);
+    const Tensor& indices = Input(INDICES);
+    const Tensor& updates = Input(UPDATES);
+    const TypeMeta dataType = data.dtype();
+    size_t item_bytesize = dataType.itemsize();
+
+    // ONNX allows negative axis to index from the back, valid range: [-r, r].
+    axis_ = data.canonical_axis_index(axis_);
+
+    CAFFE_ENFORCE_GE(data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D");
+    CAFFE_ENFORCE_GE(axis_, 0, "Axis should be non-negative");
+    CAFFE_ENFORCE_LT(axis_, data.dim(), "Axis out of range");
+
+    Tensor* output = Output(0, data.sizes().vec(), at::dtype(dataType));
+    output->CopyFrom(data);
+    char* out = static_cast(output->raw_mutable_data(dataType));
+
+    // Succeed if size of output is zero, which can happen for empty batch which
+    // would have data dimension size of 0.
+    // This *must* be done AFTER output->raw_mutable_data() above as that has
+    // important allocation side effect that we must see.
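Aside (not part of the patch): the ScatterOp implementation continues below; as a reading aid, here is a minimal NumPy sketch of the axis == 0 update rule spelled out in the Scatter schema doc above. The operator's default axis is 1, and the function name and sample values here are illustrative only, not part of this diff.

```python
# Sketch of DATA[INDICES[i][j]][j] = UPDATES[i][j] (axis == 0, 2-D case).
import numpy as np

def scatter_axis0(data, indices, updates):
    out = data.copy()  # Scatter copies DATA, then overwrites the indexed entries
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            out[indices[i, j], j] = updates[i, j]
    return out

data = np.zeros((3, 3), dtype=np.float32)
indices = np.array([[1, 0, 2], [0, 2, 1]])
updates = np.array([[1.0, 1.1, 1.2], [2.0, 2.1, 2.2]], dtype=np.float32)
print(scatter_axis0(data, indices, updates))
# [[2.  1.1 0. ]
#  [1.  0.  2.2]
#  [0.  2.1 1.2]]
```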
+ if (output->numel() == 0) { + return true; + } + + const IndexType* idxs = indices.template data(); + const char* src_base = static_cast(updates.raw_data()); + + const int64_t outer_dims_product = updates.size_to_dim(axis_); + const int64_t block_size = updates.size_from_dim(axis_ + 1); + const int64_t block_bytesize = block_size * item_bytesize; + + const int64_t src_indexing_axis_dim = updates.size(axis_); + const int64_t src_batch_bytesize = updates.size_from_dim(axis_) * item_bytesize; + const int64_t dst_batch_size = data.size_from_dim(axis_) * item_bytesize; + + const int64_t N = indices.size(axis_); + + check_indexarray_range(idxs, N, src_indexing_axis_dim); + + int64_t i = 0; + for (int64_t batch = 0; batch < outer_dims_product; ++batch) { + int64_t i_max = i + N; + for (; i < i_max && i < indices.numel(); ++i) { + auto idx = idxs[i]; + + auto src = src_base + batch * src_batch_bytesize + idx * block_bytesize; + auto dst = out + batch * dst_batch_size + (i - i_max + N) * block_bytesize; + context_.CopyItemsSameDevice(dataType, block_size, src, dst); + } + } + return true; + } + + INPUT_TAGS(DATA, INDICES, UPDATES); + + // Check that indices fall within dimension array size with CAFFE_ENFORCE. + template + static void check_indexarray_range( + const IndexType* indices, + int64_t n, + IndexType indexing_axis_dim) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < indexing_axis_dim, + "INDICES element is out of DATA bounds, id=", + idx, + " axis_dim=", + indexing_axis_dim); + } + } + + protected: + int axis_; +}; + template class LengthsToSegmentIdsOp : public Operator { public: diff --git a/caffe2/opt/backend_transformer_base.cc b/caffe2/opt/backend_transformer_base.cc index 21bca5826991..45f88bda400f 100644 --- a/caffe2/opt/backend_transformer_base.cc +++ b/caffe2/opt/backend_transformer_base.cc @@ -54,8 +54,16 @@ QTensorProto BackendTransformerBase::wrapShapeInfoIntoQTensorProto( "Only quantized shapeinfo can be extracted into QTensor!"); t.set_name(name); t.set_data_type(shape_info.shape.data_type()); - t.set_scale(shape_info.q_info.scale); - t.set_bias(shape_info.q_info.offset); + t.set_axis(shape_info.q_info.axis); + t.set_is_multiparam(true); + for (const auto i : shape_info.q_info.scale) { + t.add_scales(i); + } + t.set_scale(1.0); + for (const auto i : shape_info.q_info.offset) { + t.add_biases(i); + } + t.set_bias(0.0); // precision and is_signed is not used in onnxifi workflow, but it is required // field t.set_precision(0); @@ -119,9 +127,9 @@ ShapeInfoMap BackendTransformerBase::inferShapes( shape_map.emplace(s, shape_info); } } - BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(*pred_net, shape_map); - const auto& out_map = eng.shape_info(); + auto eng = BoundShapeInferencerRegistry()->Create("C10", spec); + eng->InferBoundShapeAndType(*pred_net, shape_map, ws); + const auto& out_map = eng->shape_info(); shape_map.clear(); for (const auto& kv : out_map) { shape_map.emplace( diff --git a/caffe2/opt/bound_shape_inference_test.cc b/caffe2/opt/bound_shape_inference_test.cc index 9d4d274b3954..961a03397122 100644 --- a/caffe2/opt/bound_shape_inference_test.cc +++ b/caffe2/opt/bound_shape_inference_test.cc @@ -51,7 +51,7 @@ TEST(BoundShapeInference, SparseLengthsSum) { "Weights", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {1000, 16})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& 
out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "Weights", ShapeInfo::DimType::CONSTANT, {1000, 16}); @@ -86,7 +86,7 @@ TEST(BoundShapeInference, SparseLengthsSumFused8BitRowwise) { ShapeInfo::DimType::CONSTANT, {1000, 58}, TensorProto_DataType_INT8)); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, @@ -127,7 +127,7 @@ TEST(BoundShapeInference, LengthsRangeFill) { ShapeInfoMap shape_map; BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, @@ -175,7 +175,7 @@ TEST(BoundShapeInference, Reshape) { shape_map.emplace("B0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {16})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 1024}); @@ -203,7 +203,7 @@ TEST(BoundShapeInference, ConcatMissingInput) { "I0", makeTensorInfo(ShapeInfo::DimType::BATCH, {spec.max_batch_size, 60})); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "I0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 60}); @@ -233,7 +233,7 @@ TEST(BoundShapeInference, ConcatInferInputBackwards) { "W0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {101, 16})); shape_map.emplace("B0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {16})); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "I0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 60}); @@ -274,7 +274,7 @@ TEST(BoundShapeInference, Split) { "X1", makeTensorInfo(ShapeInfo::DimType::BATCH, {spec.max_batch_size, 2, 48})); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 48}); @@ -317,7 +317,7 @@ TEST(BoundShapeInference, FC) { shape_map.emplace("B1", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {1024})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 1024}); @@ -342,7 +342,7 @@ TEST(BoundShapeInference, FC3D) { shape_map.emplace("B0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {16})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 1024}); @@ -350,46 +350,6 @@ TEST(BoundShapeInference, FC3D) { out_shape, "Out0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 16}); } -TEST(BoundShapeInference, 
ClipRangesGatherSigridHash) { - FLAGS_caffe2_extract_feature_length_for_shape_inference = true; - NetDef net; - net.add_op()->CopyFrom(CreateOperatorDef( - "ClipRangesGatherSigridHash", - "", - {"R0", "V0"}, - {"F0_lengths_0", "F0_values_0", "F1_lengths_0", "F1_values_0"}, - {MakeArgument>("max_lengths", {200, 400})})); - ShapeInfoMap shape_map; - BoundShapeSpec spec(50, 1000); - BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); - const auto& out_shape = eng.shape_info(); - verifyShapeInfo( - out_shape, - "F0_lengths_0", - ShapeInfo::DimType::BATCH, - {spec.max_batch_size}, - TensorProto_DataType_INT32); - verifyShapeInfo( - out_shape, - "F0_values_0", - ShapeInfo::DimType::SEQ, - {spec.max_batch_size * 200}, - TensorProto_DataType_INT64); - verifyShapeInfo( - out_shape, - "F1_lengths_0", - ShapeInfo::DimType::BATCH, - {spec.max_batch_size}, - TensorProto_DataType_INT32); - verifyShapeInfo( - out_shape, - "F1_values_0", - ShapeInfo::DimType::SEQ, - {spec.max_batch_size * 400}, - TensorProto_DataType_INT64); -} - TEST(BoundShapeInference, Combo0) { NetDef net; net.add_op()->CopyFrom(CreateOperatorDef( @@ -421,56 +381,9 @@ TEST(BoundShapeInference, Combo0) { "Indices", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {2})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); LOG(INFO) << eng.PrintShapeInfo(); verifyShapeInfo( out_shape, "Gout", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 2}); } - -TEST(BoundShapeInference, Combo1) { - FLAGS_caffe2_extract_feature_length_for_shape_inference = true; - NetDef net; - net.add_op()->CopyFrom(CreateOperatorDef( - "ClipRangesGatherSigridHash", - "", - {"R0", "V0"}, - {"F0_lengths_0", "F0_values_0", "F1_lengths_0", "F1_values_0"}, - {MakeArgument>("max_lengths", {300, 400})})); - - net.add_op()->CopyFrom(CreateOperatorDef( - "SparseLengthsSumFused8BitRowwise", - "", - {"Weights", "F0_values_0", "F0_lengths_0"}, - {"Out"}, - {})); - ShapeInfoMap shape_map; - shape_map.emplace( - "Weights", - makeTensorInfo( - ShapeInfo::DimType::CONSTANT, {1000, 58}, TensorProto_DataType_INT8)); - BoundShapeSpec spec(20, 1000); - BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); - const auto& out_shape = eng.shape_info(); - verifyShapeInfo( - out_shape, - "Weights", - ShapeInfo::DimType::CONSTANT, - {1000, 58}, - TensorProto_DataType_INT8); - verifyShapeInfo( - out_shape, - "F0_values_0", - ShapeInfo::DimType::SEQ, - {spec.max_batch_size * 300}, - TensorProto_DataType_INT64); - verifyShapeInfo( - out_shape, - "F0_lengths_0", - ShapeInfo::DimType::BATCH, - {spec.max_batch_size}, - TensorProto_DataType_INT32); - verifyShapeInfo( - out_shape, "Out", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 50}); -} diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index 58652e4ab725..68167609ac44 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -38,17 +38,19 @@ int64_t SizeToDim(const TensorShape& shape, int axis) { } return r; } +} // namespace -void EnsureShapeNames(std::unordered_map* info) { +void BoundShapeInferencer::EnsureShapeNames( + std::unordered_map* info) const { for (auto& kv : *info) { kv.second.shape.set_name(kv.first); } } -} // namespace void BoundShapeInferencer::InferBoundShapeAndType( const NetDef& net, - const std::unordered_map& info) { + const std::unordered_map& 
info, + caffe2::Workspace* ws) { const static std::unordered_set unsupported{"Tile"}; shape_info_ = info; @@ -79,10 +81,6 @@ void BoundShapeInferencer::InferBoundShapeAndType( InferGivenTensorFill(op); } else if (op.type() == "Shape") { InferShape(op); - } else if ( - op.type() == "ClipRangesGatherSigridHash" && - FLAGS_caffe2_extract_feature_length_for_shape_inference) { - InferClipRangesGatherSigridHash(op); } else { InferCommonOp(op); } @@ -125,8 +123,11 @@ TensorShape& BoundShapeInferencer::CheckAndSetTensorShapeAndType( TensorShape& shape = shape_info.shape; if (is_quantized) { shape_info.is_quantized = true; - shape_info.q_info.scale = 1; - shape_info.q_info.offset = 0; + shape_info.q_info.scale.clear(); + shape_info.q_info.scale.push_back(1); + shape_info.q_info.offset.clear(); + shape_info.q_info.offset.push_back(0); + shape_info.q_info.axis = 1; } if (!rt.second) { // Check shape consistency @@ -340,9 +341,9 @@ void BoundShapeInferencer::InferConcatInputs(const OperatorDef& op) { } } -// For concat net, if some inputs are missing and we have add_axis argument, it -// means that all the inputs should be of the same dimension. In this case, we -// can infer the shape of the missing inputs +// For concat net, if some inputs are missing and we have add_axis argument, +// it means that all the inputs should be of the same dimension. In this case, +// we can infer the shape of the missing inputs void BoundShapeInferencer::InferConcat(const OperatorDef& op) { ArgumentHelper helper(op); auto add_axis = helper.GetSingleArgument("add_axis", 0); @@ -418,7 +419,8 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) { const ShapeInfo& b_shape_info = b_it->second; auto x_it = shape_info_.find(op.input(0)); if (x_it == shape_info_.end()) { - // We don't have a hint at the x input we try to deduce it from weight shape + // We don't have a hint at the x input we try to deduce it from weight + // shape ArgumentHelper helper(op); auto axis = helper.GetSingleArgument("axis", 1); auto axis_w = helper.GetSingleArgument("axis_w", 1); @@ -464,104 +466,67 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) { false); } -void BoundShapeInferencer::InferClipRangesGatherSigridHash( - const OperatorDef& op) { - CAFFE_ENFORCE( - op.output_size() % 2 == 0, - "ClipRangesGatherSigridHash has to have even number of outputs"); - ArgumentHelper helper(op); - auto max_lengths_arg = helper.GetRepeatedArgument("max_lengths"); - CAFFE_ENFORCE_EQ( - max_lengths_arg.size() * 2, - op.output_size(), - "Output size of ClipRangesGatherSigridHash has to be the same with 2 * length of max_lengths arg"); - for (int i = 0; i < op.output_size(); i++) { - auto output_name = op.output(i); - if (i % 2 == 0) { - CAFFE_ENFORCE( - output_name.find("lengths") != std::string::npos, - "In ClipRangesGatherSigridHash, name of output in even index has to contain 'lengths'"); - CheckAndSetTensorShapeAndType( - output_name, - ShapeInfo::DimType::BATCH, - {spec_.max_batch_size}, - TensorProto_DataType_INT32, - false); - } else { - CAFFE_ENFORCE( - output_name.find("values") != std::string::npos, - "In ClipRangesGatherSigridHash, name of output in odd index has to contain 'values'"); - CheckAndSetTensorShapeAndType( - output_name, - ShapeInfo::DimType::SEQ, - {max_lengths_arg[i / 2] * spec_.max_batch_size}, - TensorProto_DataType_INT64, - false); - } - } -} - void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { // First, we need to check that all the input shape/types are already // presented try { - std::vector 
input_shapes; - for (const auto& input : op.input()) { - const auto it = shape_info_.find(input); - if (it == shape_info_.end()) { - LOG(WARNING) << "Cannot find shape info for " << input << ". Skipping " - << op.type(); - return; + std::vector input_shapes; + for (const auto& input : op.input()) { + const auto it = shape_info_.find(input); + if (it == shape_info_.end()) { + LOG(WARNING) << "Cannot find shape info for " << input << ". Skipping " + << op.type(); + return; + } + input_shapes.emplace_back(it->second.shape); } - input_shapes.emplace_back(it->second.shape); - } - const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); - CAFFE_ENFORCE(schema); - std::vector output_shapes; + const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); + CAFFE_ENFORCE(schema); + std::vector output_shapes; output_shapes = schema->InferTensor(op, input_shapes); - int i = 0; - bool is_quantized = - !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize"); - TensorProto::DataType infered_data_type = TensorProto::UNDEFINED; - if (is_quantized) { - const static std::map type_info_from_input = { - {"Int8Quantize", -1}, // Force this op's output to be uint8 - {"Int8ConvRelu", 1}, - {"Int8MaxPool", 0}, - {"Int8AveragePool", 0}, - {"Int8FC", 1}, - {"Int8Conv", 1}, - {"Int8SumRelu", 0}}; - CAFFE_ENFORCE( - type_info_from_input.find(op.type()) != type_info_from_input.end(), - "Undefined quantized output data type, add it into type_info_from_input"); - int target = type_info_from_input.find(op.type())->second; - if (target == -1) { - infered_data_type = TensorProto::UINT8; - } else { - CAFFE_ENFORCE(target < input_shapes.size()); - infered_data_type = input_shapes[target].data_type(); + int i = 0; + bool is_quantized = + !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize"); + TensorProto::DataType infered_data_type = TensorProto::UNDEFINED; + if (is_quantized) { + const static std::map type_info_from_input = { + {"Int8Quantize", -1}, // Force this op's output to be uint8 + {"Int8ConvRelu", 1}, + {"Int8MaxPool", 0}, + {"Int8AveragePool", 0}, + {"Int8FC", 1}, + {"Int8Conv", 1}, + {"Int8SumRelu", 0}}; + CAFFE_ENFORCE( + type_info_from_input.find(op.type()) != type_info_from_input.end(), + "Undefined quantized output data type, add it into type_info_from_input"); + int target = type_info_from_input.find(op.type())->second; + if (target == -1) { + infered_data_type = TensorProto::UINT8; + } else { + CAFFE_ENFORCE(target < input_shapes.size()); + infered_data_type = input_shapes[target].data_type(); + } + } else if (op.type() == "Int8Dequantize") { + infered_data_type = TensorProto::FLOAT; } - } else if (op.type() == "Int8Dequantize") { - infered_data_type = TensorProto::FLOAT; - } - for (const auto& shape : output_shapes) { - if (infered_data_type == TensorProto::UNDEFINED) { - infered_data_type = shape.data_type(); - } - if (shape.unknown_shape()) { - ++i; - continue; + for (const auto& shape : output_shapes) { + if (infered_data_type == TensorProto::UNDEFINED) { + infered_data_type = shape.data_type(); + } + if (shape.unknown_shape()) { + ++i; + continue; + } + CheckAndSetTensorShapeAndType( + op.output(i++), + current_dim_type_, + ConvertToVec(shape.dims()), + infered_data_type, + is_quantized); } - CheckAndSetTensorShapeAndType( - op.output(i++), - current_dim_type_, - ConvertToVec(shape.dims()), - infered_data_type, - is_quantized); - } } catch (const caffe2::EnforceNotMet& e) { LOG(ERROR) << "Enforce not met while inferring shapes for " << op.type() << ": " << e.msg(); @@ 
-571,4 +536,18 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { } } +std::shared_ptr getBoundShapeInferencer( + const BoundShapeSpec& spec) { + return std::make_shared(spec); +} + +C10_DEFINE_SHARED_REGISTRY( + BoundShapeInferencerRegistry, + BoundShapeInferencerBase, + const BoundShapeSpec&); + +C10_REGISTER_CREATOR( + BoundShapeInferencerRegistry, + C10, + getBoundShapeInferencer); } // namespace caffe2 diff --git a/caffe2/opt/bound_shape_inferencer.h b/caffe2/opt/bound_shape_inferencer.h index 2b5a4ce067dc..77bb3003bc61 100644 --- a/caffe2/opt/bound_shape_inferencer.h +++ b/caffe2/opt/bound_shape_inferencer.h @@ -29,16 +29,19 @@ struct CAFFE2_API BoundShapeSpec { /// then propagates the bound shape down the network. For now the variable part /// (bound part) is the first dimension of the shape, which usually corresponds /// to the batch size or sequence lookup size. -class CAFFE2_API BoundShapeInferencer { +class BoundShapeInferencerBase { public: - explicit BoundShapeInferencer(const BoundShapeSpec& spec) : spec_(spec) { + explicit BoundShapeInferencerBase(const BoundShapeSpec& spec) : spec_(spec) { CAFFE_ENFORCE_GE(spec_.max_batch_size, 0); CAFFE_ENFORCE_GE(spec_.max_seq_size, 0); } - void InferBoundShapeAndType( + virtual ~BoundShapeInferencerBase() {} + + virtual void InferBoundShapeAndType( const NetDef& net, - const std::unordered_map& info); + const std::unordered_map& info, + caffe2::Workspace* ws) = 0; const ShapeInfoMap& shape_info() const { return shape_info_; @@ -58,7 +61,24 @@ class CAFFE2_API BoundShapeInferencer { return ss.str(); } - private: + protected: + const BoundShapeSpec spec_; + std::unordered_map shape_info_; +}; + +class CAFFE2_API BoundShapeInferencer : public BoundShapeInferencerBase { + public: + explicit BoundShapeInferencer(const BoundShapeSpec& spec) + : BoundShapeInferencerBase(spec) {} + + virtual ~BoundShapeInferencer() override {} + + void InferBoundShapeAndType( + const NetDef& net, + const std::unordered_map& info, + caffe2::Workspace* ws) override; + + protected: TensorShape& CheckAndSetTensorShapeAndType( const std::string& name, ShapeInfo::DimType t, @@ -83,16 +103,23 @@ class CAFFE2_API BoundShapeInferencer { void InferShape(const OperatorDef& op); void InferReshape(const OperatorDef& op); void InferLengthsRangeFill(const OperatorDef& op); - void InferClipRangesGatherSigridHash(const OperatorDef& op); // Standard shape/type inference using op schema registered shape inference // function void InferCommonOp(const OperatorDef& op); - const BoundShapeSpec spec_; + void EnsureShapeNames(std::unordered_map* info) const; + ShapeInfo::DimType current_dim_type_{ShapeInfo::DimType::BATCH}; int64_t current_max_batch_size_{0}; - std::unordered_map shape_info_; }; +CAFFE2_API std::shared_ptr getBoundShapeInferencer( + const BoundShapeSpec& spec); + +C10_DECLARE_SHARED_REGISTRY( + BoundShapeInferencerRegistry, + BoundShapeInferencerBase, + const BoundShapeSpec&); + } // namespace caffe2 diff --git a/caffe2/opt/optimize_ideep.cc b/caffe2/opt/optimize_ideep.cc index d770479512b9..f0d251e66e99 100644 --- a/caffe2/opt/optimize_ideep.cc +++ b/caffe2/opt/optimize_ideep.cc @@ -79,6 +79,11 @@ bool isOnIdeepDevice(const repr::NeuralNetOperator& nnOp) { } bool isConvFusion(repr::NNGraph::NodeRef convNode, int fusion_type) { + // Here we only check the type of ConvFusion op (for FP32 only) + if (!repr::nn::is(convNode)) { + return false; + } + auto conv = repr::nn::get(convNode); auto& op = getOpDef(*conv); diff --git a/caffe2/opt/shape_info.cc 
b/caffe2/opt/shape_info.cc index ebd7b68303be..7eb8d0d3726e 100644 --- a/caffe2/opt/shape_info.cc +++ b/caffe2/opt/shape_info.cc @@ -14,7 +14,23 @@ ShapeInfo getShapeInfoFromBlob(const Blob* blob) { if (blob->meta().id() == TypeMeta::Id()) { shape_info.is_quantized = true; LoadInt8TensorInfoOfBlob( - &shape_info.q_info.scale, &shape_info.q_info.offset, blob); + &shape_info.q_info.scale, + &shape_info.q_info.offset, + &shape_info.q_info.axis, + blob); + } else { +#ifndef C10_MOBILE + auto function_ptr = + ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id()); + if (function_ptr != nullptr) { + shape_info.is_quantized = true; + function_ptr->LoadInfoOfBlob( + blob, + &shape_info.q_info.scale, + &shape_info.q_info.offset, + &shape_info.q_info.axis); + } +#endif } return shape_info; } diff --git a/caffe2/opt/shape_info.h b/caffe2/opt/shape_info.h index 06d42821f9fb..622b6404c0c8 100644 --- a/caffe2/opt/shape_info.h +++ b/caffe2/opt/shape_info.h @@ -5,13 +5,25 @@ namespace caffe2 { struct CAFFE2_API QShapeInfo { - QShapeInfo(float o = 0, float s = 1) : offset(o), scale(s) {} - float offset; - float scale; - // TODO zrphercule - // Add multi offset/scale support here + QShapeInfo(float o = 0, float s = 1, uint32_t a = 1) { + offset.clear(); + scale.clear(); + offset.push_back(o); + scale.push_back(s); + axis = a; + } + + uint32_t axis; + vector offset; + vector scale; }; +CAFFE2_API void LoadInt8FCDNNLowPPackedWeightBlobInfoOfBlob( + std::vector* scale, + std::vector* offset, + uint32_t* axis, + const Blob* b); + struct CAFFE2_API ShapeInfo { enum DimType : int8_t { UNKNOWN = 0, CONSTANT = 1, BATCH = 2, SEQ = 3 }; ShapeInfo(bool q = false) : is_quantized(q) {} diff --git a/caffe2/perfkernels/embedding_lookup_avx2.cc b/caffe2/perfkernels/embedding_lookup_avx2.cc index 271c07a7bb5d..87ac57356d2a 100644 --- a/caffe2/perfkernels/embedding_lookup_avx2.cc +++ b/caffe2/perfkernels/embedding_lookup_avx2.cc @@ -1291,6 +1291,7 @@ static bool EmbeddingLookup_int32_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1335,10 +1336,10 @@ static bool EmbeddingLookup_int32_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } @@ -1837,6 +1838,7 @@ static bool EmbeddingLookup_int64_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1881,10 +1883,10 @@ static bool EmbeddingLookup_int64_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } diff --git a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc index 12f790df22e3..230b3bc85687 100644 --- a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc +++ 
b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc @@ -1280,6 +1280,7 @@ static bool Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1324,10 +1325,10 @@ static bool Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } @@ -1821,6 +1822,7 @@ static bool Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1865,10 +1867,10 @@ static bool Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index 0af4120b064f..bfde7521446d 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -197,6 +197,8 @@ def compute(InType, use_weights, isa): return code code = [] + if InType == "at::Half": + code.append(" alignas(64) at::Half vtmp1[8] = {0};") code.append( " for (" + IndexType @@ -283,14 +285,15 @@ def compute(InType, use_weights, isa): code.extend(compute(InType, use_weights, isa)) code.append(" }") # leftover - if InType == "at::Half": - code.append(" alignas(64) at::Half vtmp1[8];") code.append(" for (; j < block_size; j++) {") if InType == "float": code.append(" op[j] += wgt * ip[j];") elif InType == "at::Half": code.append(" vtmp1[0] = ip[j];") - code.append(" __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));") + code.append( + " __m256 vtmp2 =\n" + " _mm256_cvtph_ps(*(reinterpret_cast(vtmp1)));" + ) code.append(" op[j] += wgt * ((float*)(&vtmp2))[0];") elif InType == "uint8_t": code.append(" op[j] += wgt * ((float)ip[j]) + bio;") diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 615d871d5c79..300d161a6e65 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -4,6 +4,7 @@ namespace caffe2 { +class Workspace; namespace { void enforceIsTensor(Workspace* ws, const std::string& name) { diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h index eda1c9d03ca2..243729b044e9 100644 --- a/caffe2/predictor/predictor_config.h +++ b/caffe2/predictor/predictor_config.h @@ -1,7 +1,8 @@ #pragma once #include -#include "caffe2/core/net.h" + #include "caffe2/core/tensor.h" +#include "caffe2/core/workspace.h" #include "caffe2/proto/metanet.pb.h" #include "caffe2/proto/predictor_consts.pb.h" diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 2b079de3d44a..576c1034c561 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -126,6 +126,17 @@ message QTensorProto { repeated int32 data = 6 [packed = 
true]; optional string name = 7; optional TensorProto.DataType data_type = 8 [default = INT32]; + + // Multi-group quantization params + repeated double scales = 9; + repeated double biases = 10; + + // Multi-group quantization needed, indicates in which dimension + // we do the "group wise quantization" + optional int32 axis = 11; + + // It should be true if it is a multi-group quantization proto + optional bool is_multiparam = 12 [default = false]; } // TensorProtos stores multiple TensorProto objects in one single proto. This diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto index 6208e0276cee..a140bb82cb5c 100644 --- a/caffe2/proto/torch.proto +++ b/caffe2/proto/torch.proto @@ -69,6 +69,9 @@ message ModuleDef { optional bool optimize = 8; repeated AttributeDef attributes = 9; + + // Used for retrieving module state from the pickled IValues table + optional int64 get_state_attribute_id = 10; } // Represents all non-module code that the model depends on. @@ -79,7 +82,7 @@ message LibDef { } enum ProtoVersion { - PROTO_VERSION_NEWEST = 0x0000000000000003; + PROTO_VERSION_NEWEST = 0x0000000000000005; } message ModelDef { diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index f73745513c10..73ef06999a54 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -19,7 +19,6 @@ ) logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 38078773bed0..f142dde5cc38 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -106,6 +106,14 @@ def add_metric_field(self, name, value): (name, value) ) + # an empty white_set will skip everything + def filter_metrics_schema(self, white_set): + logger.info("Filter metric schema with white_set {}".format(white_set)) + field_names = self._metrics_schema.field_names() + for name in field_names: + if name not in white_set: + self._metrics_schema = self._metrics_schema - schema.Struct((name, schema.Scalar())) + def add_ad_hoc_plot_blob(self, blob, dtype=None): assert isinstance( blob, (six.string_types, core.BlobReference) diff --git a/caffe2/python/layers/batch_distill_lr_loss.py b/caffe2/python/layers/batch_distill_lr_loss.py deleted file mode 100644 index c4a367956922..000000000000 --- a/caffe2/python/layers/batch_distill_lr_loss.py +++ /dev/null @@ -1,191 +0,0 @@ -## @package batch_distill_lr_loss -# Module caffe2.python.layers.batch_distill_lr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -class BatchDistillLRLoss(ModelLayer): - - def __init__( - self, model, input_record, - name='batch_distill_lr_loss', teacher_weight=0.0, - filter_invalid_teacher_label=False, **kwargs): - - super(BatchDistillLRLoss, self).__init__(model, name, input_record, **kwargs) - - assert teacher_weight >= 0 and teacher_weight <= 1, ( - 'teacher_weight=%0.2f should be in [0, 1]' % teacher_weight - ) - - self._teacher_weight = teacher_weight - self._filter_invalid_teacher_label = filter_invalid_teacher_label - # hyper-parameter determines whether to filter out bad teacehr labels, - # i.e., teacher labels that are zero. 
- if self._filter_invalid_teacher_label: - self.threshold = model.add_global_constant( - str(model.net.NextScopedBlob('threshold')), - [0.0], # threshold for filtering teacher weight. - dtype=np.float - ) - self.neg_ONE = model.add_global_constant( - str(model.net.NextScopedBlob('neg_ONE')), - [-1.0], - dtype=np.float - ) - self.ONE = model._GetOne() - assert schema.is_schema_subset( - schema.Struct( - ('teacher_label', schema.Scalar()), - ('label', schema.Scalar()), - ('logit', schema.Scalar()), - ), - input_record - ) - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - - self.output_schema = schema.Scalar( - np.float32, - self.get_next_blob_reference('output') - ) - - def add_ops(self, net): - label = self.input_record.label() - if self.input_record.label.field_type() != np.float32: - label = net.Cast( - label, - net.NextScopedBlob('float_label'), - to=core.DataType.FLOAT, - ) - - # Assuming 1-D input - label = net.ExpandDims(label, net.NextScopedBlob('expanded_label'), - dims=[1]) - - teacher_label = self.input_record.teacher_label() - - if self.input_record.teacher_label.field_type() != np.float32: - teacher_label = net.Cast( - teacher_label, - net.NextScopedBlob('float_teacher_label'), - to=core.DataType.FLOAT, - ) - teacher_label = net.ExpandDims( - teacher_label, net.NextScopedBlob('expanded_teacher_label'), - dims=[1]) - - true_xent = net.SigmoidCrossEntropyWithLogits( - [self.input_record.logit(), label], - net.NextScopedBlob('cross_entropy') - ) - - teacher_xent = net.SigmoidCrossEntropyWithLogits( - [self.input_record.logit(), teacher_label], - net.NextScopedBlob('teacher_cross_entropy') - ) - if self._filter_invalid_teacher_label: - squeezed_teacher_label = net.Squeeze( - teacher_label, - net.NextScopedBlob('squeezed_teacher_label'), - dims=[1] - ) - # blob used to contain the original teacher weights - keep_weights = net.ConstantFill( - [squeezed_teacher_label], - net.NextScopedBlob('keep_weights'), - value=self._teacher_weight, - dtype=core.DataType.FLOAT - ) - #blob used to zero out the teacher weights - zero_weights = net.ConstantFill( - [squeezed_teacher_label], - net.NextScopedBlob('zero_weights'), - value=0.0, - dtype=core.DataType.FLOAT - ) - - #Indicating which teacher labels are bad, i.e., are zero. - judge = net.GT( - [squeezed_teacher_label, self.threshold], - net.NextScopedBlob('judge'), - broadcast=1 - ) - #zero out bad teacher weights corresponding to bad teacher labels. 
- screened_teacher_weights = net.Conditional( - [judge, keep_weights, zero_weights], - net.NextScopedBlob('screened_teacher_weights') - ) - neg_screened_teacher_weights = net.Mul( - [screened_teacher_weights, self.neg_ONE], - net.NextScopedBlob('neg_screened_teacher_weights'), - broadcast=1 - ) - one_minus_screened_teacher_weights = net.Add( - [neg_screened_teacher_weights, self.ONE], - net.NextScopedBlob('one_minus_screened_teacher_weights'), - broadcast=1 - ) - scaled_true_xent = net.Mul( - [true_xent, one_minus_screened_teacher_weights], - net.NextScopedBlob('scaled_cross_entropy'), - broadcast=1 - ) - scaled_teacher_xent = net.Mul( - [teacher_xent, screened_teacher_weights], - net.NextScopedBlob('scaled_teacher_cross_entropy'), - broadcast=1 - ) - else: - scaled_true_xent = net.Scale( - true_xent, - net.NextScopedBlob('scaled_cross_entropy'), - scale=float(1.0 - self._teacher_weight), - ) - scaled_teacher_xent = net.Scale( - teacher_xent, - net.NextScopedBlob('scaled_teacher_cross_entropy'), - scale=float(self._teacher_weight), - ) - if 'weight' in self.input_record.fields: - weight_blob = self.input_record.weight() - if self.input_record.weight.field_type().base != np.float32: - weight_blob = net.Cast( - weight_blob, - weight_blob + '_float32', - to=core.DataType.FLOAT - ) - weight_blob = net.StopGradient( - [weight_blob], - [net.NextScopedBlob('weight_stop_gradient')], - ) - scaled_true_xent = net.Mul( - [scaled_true_xent, weight_blob], - net.NextScopedBlob('weighted_xent_label'), - ) - scaled_teacher_xent = net.Mul( - [scaled_teacher_xent, weight_blob], - net.NextScopedBlob('weighted_xent_teacher'), - ) - - true_loss = net.AveragedLoss( - scaled_true_xent, - net.NextScopedBlob('true_loss') - ) - teacher_loss = net.AveragedLoss( - scaled_teacher_xent, - net.NextScopedBlob('teacher_loss') - ) - net.Add( - [true_loss, teacher_loss], - self.output_schema.field_blobs() - ) diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index 73c937358b38..b38909ed9e7a 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -58,6 +58,10 @@ class SparseLookup(ModelLayer): 'Float16UniformFill' ] + _fp16_compatible_reducers = [ + 'Sum', 'Mean', 'Sqrt', 'PositionWeighted', 'RecencyWeighted', + ] + def __init__(self, model, input_record, inner_shape, reducer, weight_init=None, weight_optim=None, name='sparse_lookup', regularizer=None, **kwargs): @@ -105,6 +109,14 @@ def __init__(self, model, input_record, inner_shape, reducer, # If fp16 is used, make sure fp16 init op is used if self.trainer_version == "fp16": + assert self.reducer in self._fp16_compatible_reducers, ( + "Fp16 training is enabled. The reducer specified is not supported. " + "Got {}. Supported reducers: {}. Right now, in general, sum, mean, " + "positional pooling are supported. Attention is not. 
Please check " + "if there is fp16 trained sparse features using advanced pooling.".format( + self.reducer, self._fp16_compatible_reducers) + ) + # if init op is UniformFill, we replace it directly if self.weight_init[0] == "UniformFill": self.weight_init = ("Float16UniformFill", self.weight_init[1]) diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index 031060301be5..eaee1e52e9c1 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -113,6 +113,31 @@ def testAddLoss(self): assert core.BlobReference('loss_blob_in_tuple_1')\ in self.model.loss.field_blobs() + def testFilterMetricSchema(self): + self.model.add_metric_field("a:b", schema.Scalar()) + self.model.add_metric_field("a:c", schema.Scalar()) + self.model.add_metric_field("d", schema.Scalar()) + + self.assertEqual( + self.model.metrics_schema, + schema.Struct( + ("a", schema.Struct( + ("b", schema.Scalar()), + ("c", schema.Scalar()), + )), + ("d", schema.Scalar()), + )) + + self.model.filter_metrics_schema({"a:b", "d"}) + self.assertEqual( + self.model.metrics_schema, + schema.Struct( + ("a", schema.Struct( + ("b", schema.Scalar()), + )), + ("d", schema.Scalar()), + )) + def testAddOutputSchema(self): # add the first field self.model.add_output_schema('struct', schema.Struct()) @@ -701,72 +726,6 @@ def testSamplingTrain(self): ] ) - def testDistillBatchLRLoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float64, (1,)))), - ('logit', schema.Scalar((np.float32, (2,)))), - ('teacher_label', schema.Scalar((np.float32(1,)))), - ('weight', schema.Scalar((np.float64, (1,)))) - )) - loss = self.model.BatchDistillLRLoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testDistillBatchLRLossWithTeacherWeightScreen(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float32, (2,)))), - ('logit', schema.Scalar((np.float32, (2, 1)))), - ('teacher_label', schema.Scalar((np.float32(2,)))), - ('weight', schema.Scalar((np.float64, (2,)))) - )) - label_items = np.array([1.0, 1.0], dtype=np.float32) - logit_items = np.array([[1.0], [1.0]], dtype=np.float32) - teacher_label_items = np.array([0.8, -1.0], dtype=np.float32) - weight_items = np.array([1.0, 1.0], dtype=np.float32) - schema.FeedRecord( - input_record, - [label_items, logit_items, teacher_label_items, weight_items] - ) - loss = self.model.BatchDistillLRLoss( - input_record, - teacher_weight=0.5, - filter_invalid_teacher_label=True - ) - self.run_train_net_forward_only() - tensor_loss = workspace.FetchBlob(loss.field_blobs()[0]) - - def cross_entropy(label, logit): - return logit - logit * label + np.log(1 + np.exp(-1.0 * logit)) - - def cal_cross_entropy( - label_items, logit_items, teacher_label_items, weight_items - ): - total_ce = 0 - for i in range(label_items.shape[0]): - true_xent = cross_entropy(label_items[i], logit_items[i, 0]) - if teacher_label_items[i] > 0: - teacher_xent = cross_entropy( - teacher_label_items[i], logit_items[i, 0] - ) - else: - teacher_xent = 0 - teacher_weight = 0.5 if teacher_label_items[i] > 0 else 0 - total_ce += (true_xent * (1 - teacher_weight) + - teacher_xent * teacher_weight) * weight_items[i] - return total_ce / label_items.shape[0] - - correct_ace = cal_cross_entropy( - label_items, - logit_items, - teacher_label_items, - weight_items - ) - self.assertAlmostEqual( - tensor_loss, - np.array(correct_ace), - delta=0.0000001, - msg="Wrong cross entropy {}".format(tensor_loss) - ) - def 
testBatchLRLoss(self): input_record = self.new_record(schema.Struct( ('label', schema.Scalar((np.float64, (1,)))), diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py index 8a1928e08ed9..6b4f1716ffcc 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ b/caffe2/python/modeling/compute_norm_for_blobs.py @@ -19,10 +19,11 @@ class ComputeNormForBlobs(NetModifier): blobs: list of blobs to compute norm for logging_frequency: frequency for printing norms to logs p: type of norm. Currently it supports p=1 or p=2 - compute_averaged_norm: norm or averaged_norm (averaged_norm = norm/size) + compute_averaged_norm: norm or averaged_norm (averaged_norm = norm/size + row_index: to plot the entire blob or simply one row at the row_index) """ - def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False): + def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False, row_index=None): self._blobs = blobs self._logging_frequency = logging_frequency self._p = p @@ -31,11 +32,17 @@ def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False): if compute_averaged_norm: self._field_name_suffix = '_averaged' + self._field_name_suffix + if row_index and row_index < 0: + raise Exception('{0} is not a valid row index, row_index should be >= 0'.format( + row_index)) + self.row_index = row_index + def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, modify_output_record=False): p = self._p compute_averaged_norm = self._compute_averaged_norm + row_index = self.row_index CPU = muji.OnCPU() # if given, blob_to_device is a map from blob to device_option @@ -51,12 +58,21 @@ def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, device = CPU with core.DeviceScope(device): - norm_name = net.NextScopedBlob(prefix=blob + self._field_name_suffix) + if row_index and row_index >= 0: + blob = net.Slice( + [blob], + net.NextScopedBlob(prefix=blob + '_row_{0}'.format(row_index)), + starts=[row_index, 0], + ends=[row_index + 1, -1] + ) + cast_blob = net.Cast( blob, net.NextScopedBlob(prefix=blob + '_float'), to=core.DataType.FLOAT ) + + norm_name = net.NextScopedBlob(prefix=blob + self._field_name_suffix) norm = net.LpNorm( cast_blob, norm_name, p=p, average=compute_averaged_norm ) diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py index f4e8d1ef5614..d6bfda1adf92 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ b/caffe2/python/modeling/compute_norm_for_blobs_test.py @@ -102,10 +102,10 @@ def test_compute_averaged_norm_for_blobs(self): workspace.RunNetOnce(model.net) fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_l2_averaged_norm = workspace.FetchBlob('fc1_w_averaged_l2_norm') + fc1_w_averaged_l2_norm = workspace.FetchBlob('fc1_w_averaged_l2_norm') - self.assertEqual(fc1_w_l2_averaged_norm.size, 1) - self.assertAlmostEqual(fc1_w_l2_averaged_norm[0], + self.assertEqual(fc1_w_averaged_l2_norm.size, 1) + self.assertAlmostEqual(fc1_w_averaged_l2_norm[0], np.linalg.norm(fc1_w)**2 / fc1_w.size, delta=1e-5) @@ -203,3 +203,30 @@ def test_compute_l1_averaged_norm_for_blobs(self): delta=1e-5) self.assertEqual(len(model.net.Proto().op), 8) + + def test_compute_norm_row_index_for_blobs(self): + model = model_helper.ModelHelper(name="test") + data = model.net.AddExternalInput("data") + fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) + + net_modifier = ComputeNormForBlobs( + 
blobs=['fc1_w'], + logging_frequency=10, + compute_averaged_norm=True, + row_index=1 + ) + + net_modifier(model.net) + + workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) + + workspace.RunNetOnce(model.param_init_net) + workspace.RunNetOnce(model.net) + + fc1_w = workspace.FetchBlob('fc1_w') + fc1_w_row_1_averaged_l2_norm = workspace.FetchBlob('fc1_w_row_1_averaged_l2_norm') + + self.assertEqual(fc1_w_row_1_averaged_l2_norm.size, 1) + self.assertAlmostEqual(fc1_w_row_1_averaged_l2_norm[0], + np.linalg.norm(fc1_w[1])**2 / fc1_w[1].size, + delta=1e-5) diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 94709dc6acdc..3c42cc9925cd 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -55,6 +55,8 @@ def ref_adadelta(param_in, **hu.gcs) def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta, grad = inputs + moment = np.abs(moment) + moment_delta = np.abs(moment_delta) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( @@ -85,6 +87,7 @@ def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc): def test_sparse_adadelta(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta, grad = inputs moment = np.abs(moment) + moment_delta = np.abs(moment_delta) lr = np.array([lr], dtype=np.float32) # Create an indexing array containing values that are lists of indices, diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 8209b1c04930..2a94528fde52 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -60,6 +60,7 @@ def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, **hu.gcs) def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs + mom2 = np.abs(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) @@ -93,6 +94,7 @@ def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): **hu.gcs_cpu_only) def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs + mom2 = np.abs(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py new file mode 100644 index 000000000000..f22ff6b0aed9 --- /dev/null +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import core, dyndep +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +class TestBucketizeOp(hu.HypothesisTestCase): + @given( + x=hu.tensor( + min_dim=1, max_dim=2, dtype=np.float32, + elements=st.floats(min_value=-5, max_value=5)), + **hu.gcs) + def test_bucketize_op(self, x, gc, dc): + length = np.random.randint(low=1, high=5) + boundaries = np.random.randn(length) * 5 + boundaries.sort() + + def ref(x, boundaries): + bucket_idx = np.digitize(x, boundaries, right=True) + return [bucket_idx] + + op = core.CreateOperator('Bucketize', + ["X"], ["INDICES"], + boundaries=boundaries) + self.assertReferenceChecks(gc, op, [x, boundaries], ref) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git 
a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index 05ce3d0f94c8..4db3f1529d81 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -4,13 +4,11 @@ from __future__ import unicode_literals import numpy as np -import os import unittest -from hypothesis import given, settings +from hypothesis import given import hypothesis.strategies as st -from caffe2.proto import caffe2_pb2 from caffe2.python import core, utils import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial @@ -32,7 +30,8 @@ def boxes_area(boxes): def map_rois_to_fpn_levels( rois, k_min, k_max, - roi_canonical_scale, roi_canonical_level): + roi_canonical_scale, roi_canonical_level +): """Determine which FPN level each RoI in a set of RoIs should map to based on the heuristic in the FPN paper. """ @@ -130,25 +129,28 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): return outputs -class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): - @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) - def test_collect_and_dist( - self, - proposal_count, - rpn_min_level, rpn_num_levels, - roi_min_level, roi_num_levels, - rpn_post_nms_topN, - roi_canonical_scale, roi_canonical_level, - gc, dc): +def collect_rpn_ref(*inputs): + args = inputs[-1] + inputs = inputs[:-1] + rois = collect(inputs, **args) + return [rois] + +def distribute_fpn_ref(*inputs): + args = inputs[-1] + inputs = inputs[:-1] + rois = inputs[0] + num_roi_lvls = args['roi_num_levels'] + outputs = (num_roi_lvls + 2) * [None] + distribute(rois, None, outputs, **args) + # remove the first rois from output of distribute + outputs.pop(0) + return outputs + + +class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): + @staticmethod + def _create_input(proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale): np.random.seed(0) input_names = [] @@ -171,6 +173,30 @@ def test_collect_and_dist( input_names.append('rpn_roi_probs_fpn{}'.format(lvl + rpn_min_level)) inputs.append(rpn_roi_score) + return input_names, inputs + + @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) + def test_collect_and_dist( + self, + proposal_count, + rpn_min_level, rpn_num_levels, + roi_min_level, roi_num_levels, + rpn_post_nms_topN, + roi_canonical_scale, roi_canonical_level, + gc, dc + ): + input_names, inputs = self._create_input( + proposal_count, rpn_min_level, 
rpn_num_levels, roi_canonical_scale + ) + output_names = [ 'rois', ] @@ -193,7 +219,6 @@ def test_collect_and_dist( ], device_option=gc) args = { - 'proposal_count' : proposal_count, 'rpn_min_level' : rpn_min_level, 'rpn_num_levels' : rpn_num_levels, 'roi_min_level' : roi_min_level, @@ -205,10 +230,87 @@ def test_collect_and_dist( self.assertReferenceChecks( device_option=gc, op=op, - inputs=inputs+[args], + inputs=inputs + [args], reference=collect_and_distribute_fpn_rpn_ref, ) + @given( + proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) + def test_collect_and_dist_separately( + self, + proposal_count, + rpn_min_level, rpn_num_levels, + roi_min_level, roi_num_levels, + rpn_post_nms_topN, + roi_canonical_scale, roi_canonical_level, + gc, dc + ): + input_names, inputs = self._create_input( + proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale + ) + + collect_op = core.CreateOperator( + 'CollectRpnProposals', + input_names, + ['rois'], + arg=[ + utils.MakeArgument("rpn_max_level", rpn_min_level + rpn_num_levels - 1), + utils.MakeArgument("rpn_min_level", rpn_min_level), + utils.MakeArgument("rpn_post_nms_topN", rpn_post_nms_topN), + ], + device_option=gc) + collect_args = { + 'rpn_min_level' : rpn_min_level, + 'rpn_num_levels' : rpn_num_levels, + 'rpn_post_nms_topN' : rpn_post_nms_topN, + } + + self.assertReferenceChecks( + device_option=gc, + op=collect_op, + inputs=inputs + [collect_args], + reference=collect_rpn_ref, + ) + + rois = collect(inputs, **collect_args) + + output_names = [] + for lvl in range(roi_num_levels): + output_names.append('rois_fpn{}'.format(lvl + roi_min_level)) + output_names.append('rois_idx_restore') + + distribute_op = core.CreateOperator( + 'DistributeFpnProposals', + ['rois'], + output_names, + arg=[ + utils.MakeArgument("roi_canonical_scale", roi_canonical_scale), + utils.MakeArgument("roi_canonical_level", roi_canonical_level), + utils.MakeArgument("roi_max_level", roi_min_level + roi_num_levels - 1), + utils.MakeArgument("roi_min_level", roi_min_level), + ], + device_option=gc) + distribute_args = { + 'roi_min_level' : roi_min_level, + 'roi_num_levels' : roi_num_levels, + 'roi_canonical_scale' : roi_canonical_scale, + 'roi_canonical_level' : roi_canonical_level} + + self.assertReferenceChecks( + device_option=gc, + op=distribute_op, + inputs=[rois, distribute_args], + reference=distribute_fpn_ref, + ) + if __name__ == "__main__": unittest.main() diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py index 36333f411ba1..bcd277cf258b 100644 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_fill_op_test.py @@ -18,6 +18,7 @@ class TestGivenTensorFillOps(hu.HypothesisTestCase): (core.DataType.BOOL, np.bool_, "GivenTensorFill"), (core.DataType.INT32, np.int32, "GivenTensorFill"), (core.DataType.FLOAT, np.float32, "GivenTensorFill"), + (core.DataType.INT16, np.int16, "GivenTensorInt16Fill"), (core.DataType.INT32, np.int32, "GivenTensorIntFill"), (core.DataType.INT64, np.int64, 
"GivenTensorInt64Fill"), (core.DataType.BOOL, np.bool_, "GivenTensorBoolFill"), diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index 0772aee5c9b2..9a5159ee1770 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -19,6 +19,8 @@ class TestMathOps(serial.SerializedTestCase): exponent=st.floats(min_value=2.0, max_value=3.0), **hu.gcs) def test_elementwise_power(self, X, exponent, gc, dc): + # negative integer raised with non-integer exponent is domain error + X = np.abs(X) def powf(X): return (X ** exponent,) diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index a78d9434c811..04a99b2d8e94 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -539,6 +539,28 @@ def testHalfInt8Conversion(self): # TODO: find a tighter bound assert(np.allclose(x, x_recovered, atol=1e-2)) + def testLearningRateOp(self): + net = core.Net("lr_test") + iteration = net.ConstantFill( + [], + "iteration", + shape=[1], + value=0, + dtype=core.DataType.INT64, + ) + lr = net.LearningRate( + [iteration], + net.NextScopedBlob("weight_decay"), + base_lr=0.5, + policy="constantWarmup", + multiplier=0.0, + num_iter=0, + ) + (shapes, types) = workspace.InferShapesAndTypes( + [net], + ) + self.assertEqual(shapes['weight_decay'], [1]) + def testShapeOp(self): model = model_helper.ModelHelper(name="shape_op_test") model.Shape('x', 'y') diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 4e811a193534..a3a0d6fbafa9 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -122,6 +122,7 @@ def bbox_transform_ref(): -90, 90, clip_angle_thresh, + legacy_plus_one=True, ) torch.testing.assert_allclose(box_out, a) @@ -161,6 +162,7 @@ def test_box_with_nms_limits( -90, 90, clip_angle_thresh, + legacy_plus_one=True, ) ] class_prob = np.random.randn(sum(roi_counts), num_classes).astype(np.float32) @@ -206,6 +208,7 @@ def box_with_nms_limit_ref(): cls_agnostic_bbox_reg=False, input_boxes_include_bg_cls=True, output_classes_include_bg_cls=True, + legacy_plus_one=True, ) for o, o_ref in zip(outputs, output_refs): @@ -258,6 +261,7 @@ def generate_proposals_ref(): -90, 90, 1.0, + legacy_plus_one=True, ) torch.testing.assert_allclose(rois, a) torch.testing.assert_allclose(rois_probs, b) @@ -392,6 +396,7 @@ def generate_proposals_ref(): -90, 90, 1.0, + legacy_plus_one=True, ) torch.testing.assert_allclose(rois, a.cpu()) torch.testing.assert_allclose(rois_probs, b.cpu()) @@ -451,6 +456,51 @@ def test_roi_align_cpu(self): def test_roi_align_cuda(self): self._test_roi_align(device="cuda") + @given(roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10)) + def test_collect_and_distribute_fpn_rpn_proposals_op(self, roi_counts): + batch_size = len(roi_counts) + im_dims = np.random.randint(100, 600, batch_size) + rpn_rois_and_scores = [] + for i in range(5): + rpn_rois_and_scores.append(torch.Tensor(generate_rois(roi_counts, im_dims))) + for i in range(5): + rpn_rois_and_scores.append(torch.rand(sum(roi_counts))) + + rois = torch.ops._caffe2.CollectRpnProposals( + rpn_rois_and_scores, + rpn_max_level=6, + rpn_min_level=2, + rpn_post_nms_topN=sum(roi_counts), + ) + fpn_outputs = torch.ops._caffe2.DistributeFpnProposals( + rois, + 
roi_canonical_scale=224, + roi_canonical_level=4, + roi_max_level=5, + roi_min_level=2, + legacy_plus_one=True, + ) + + all_outputs = torch.ops._caffe2.CollectAndDistributeFpnRpnProposals( + rpn_rois_and_scores, + roi_canonical_scale=224, + roi_canonical_level=4, + roi_max_level=5, + roi_min_level=2, + rpn_max_level=6, + rpn_min_level=2, + rpn_post_nms_topN=sum(roi_counts), + legacy_plus_one=True, + ) + + rois_fpn_list = fpn_outputs[:-1] + rois_idx_restore_int32 = fpn_outputs[-1] + + # [rois] + fpn_outputs should be equal to all_outputs + torch.testing.assert_allclose(rois, all_outputs[0]) + for x, y in zip(fpn_outputs, all_outputs[1:]): + torch.testing.assert_allclose(x, y) + @given(X=hu.tensor(), fast_gelu=st.booleans()) def _test_gelu_op(self, X, fast_gelu, device): @@ -472,5 +522,58 @@ def test_gelu_op_cuda(self): self._test_gelu_op(device="cuda") + @given(inputs=hu.lengths_tensor( + dtype=np.float32, + min_value=1, + max_value=5, + allow_empty=True, + )) + def _test_lengths_op(self, inputs, ref_op_name, torch_op, device): + data, lengths = inputs + + def _lengths_ref(X, Y): + ref_op = core.CreateOperator(ref_op_name, ["X", "Y"], "out") + workspace.FeedBlob("X", X) + workspace.FeedBlob("Y", Y) + workspace.RunOperatorOnce(ref_op) + return workspace.FetchBlob("out") + + expected_output = _lengths_ref(data, lengths) + actual_output = torch_op( + torch.tensor(data), torch.tensor(lengths, dtype=torch.int32)) + + torch.testing.assert_allclose(expected_output, actual_output.cpu()) + + def _test_lengths_sum_op(self, device): + self._test_lengths_op("LengthsSum", torch.ops._caffe2.LengthsSum, device) + + def test_lengths_sum_op(self): + self._test_lengths_sum_op(device="cpu") + + @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") + def test_lengths_sum_op_cuda(self): + self._test_lengths_sum_op(device="cuda") + + def _test_lengths_mean_op(self, device): + self._test_lengths_op("LengthsMean", torch.ops._caffe2.LengthsMean, device) + + def test_lengths_mean_op(self): + self._test_lengths_mean_op(device="cpu") + + @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") + def test_lengths_mean_op_cuda(self): + self._test_lengths_mean_op(device="cuda") + + def _test_lengths_max_op(self, device): + self._test_lengths_op("LengthsMax", torch.ops._caffe2.LengthsMax, device) + + def test_lengths_max_op(self): + self._test_lengths_max_op(device="cpu") + + @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") + def test_lengths_max_op_cuda(self): + self._test_lengths_max_op(device="cuda") + + if __name__ == '__main__': unittest.main() diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 2b2340e68722..66434ff4ef15 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -422,7 +422,7 @@ void addObjectMethods(py::module& m) { .def("_wrap_tensor_impl", [](Blob* blob, void* ptr) { auto p = c10::intrusive_ptr:: unsafe_reclaim_from_nonowning(static_cast(ptr)); - AT_CHECK(p.defined(), "Can't wrap undefined tensor"); + TORCH_CHECK(p.defined(), "Can't wrap undefined tensor"); auto at_tensor = at::Tensor::wrap_tensor_impl(std::move(p)); BlobSetTensor(blob, Tensor(std::move(at_tensor))); }); @@ -1263,6 +1263,14 @@ void addGlobalMethods(py::module& m) { net->TEST_Benchmark(warmup_runs, main_runs, run_individual); return stat; }); + m.def("benchmark_net_once", [](const std::string& name) { + CAFFE_ENFORCE(gWorkspace); + auto* net = gWorkspace->GetNet(name); + CAFFE_ENFORCE(net, "Didn't find net: ", name); + 
py::gil_scoped_release g; + float stat = net->TEST_Benchmark_One_Run(); + return stat; + }); m.def("delete_net", [](const std::string& name) { CAFFE_ENFORCE(gWorkspace); diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 9c7ef354d1de..72187ccda06c 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -502,6 +502,12 @@ def __add__(self, other): children[name] = right_field continue left_field = children[name] + if not (isinstance(left_field, Struct) and isinstance(right_field, Struct)): + raise TypeError( + "Type of left_field, " + str(type(left_field)) + + ", and type of right_field, " + + str(type(right_field)) + + ", must both the Struct to allow merging of the field, " + name) children[name] = left_field + right_field return Struct(*(viewitems(children))) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py index 73e10d0725b8..375068ef537e 100644 --- a/caffe2/python/sparse_to_dense_mask_test.py +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -35,7 +35,8 @@ def test_sparse_to_dense_mask_invalid_inputs(self): 'SparseToDenseMask', ['indices', 'values', 'default', 'lengths'], ['output'], - mask=[999999999, 2]) + mask=[999999999, 2], + max_skipped_indices=3) workspace.FeedBlob( 'indices', np.array([2000000000000, 999999999, 2, 3, 4, 5], dtype=np.int32)) @@ -48,11 +49,13 @@ def test_sparse_to_dense_mask_invalid_inputs(self): workspace.RunOperatorOnce(op) except RuntimeError: self.fail("Exception raised with only one negative index") + + # 3 invalid inputs should throw. workspace.FeedBlob( 'indices', - np.array([2000000000000, 999999999, -2, -3, -4, -5], dtype=np.int32)) + np.array([-1, 1, 2, 3, 4, 5], dtype=np.int32)) with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) + workspace.RunOperatorMultiple(op, 3) def test_sparse_to_dense_mask_subtensor(self): op = core.CreateOperator( diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index c28865097099..cf03910b51e7 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -36,6 +36,7 @@ RootFolder = C.root_folder Workspaces = C.workspaces BenchmarkNet = C.benchmark_net +BenchmarkNetOnce = C.benchmark_net_once GetStats = C.get_stats operator_tracebacks = defaultdict(dict) diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc index 7b28da9c9173..9da2013d935e 100644 --- a/caffe2/quantization/server/fbgemm_pack_op.cc +++ b/caffe2/quantization/server/fbgemm_pack_op.cc @@ -1,5 +1,6 @@ #include "fbgemm_pack_op.h" +#include "caffe2/core/tensor.h" #include "caffe2/core/tensor_int8.h" #include "caffe2_dnnlowp_utils.h" @@ -570,10 +571,115 @@ bool ConvDNNLowPPackWeightOp::RunOnDevice() { return true; } +bool Int8DNNLowpPackedWeightBlobShapeFunctions::IsSameMetaType( + TypeIdentifier id) { + return id == TypeMeta::Id() || + id == TypeMeta::Id(); +} + +TypeIdentifier Int8DNNLowpPackedWeightBlobShapeFunctions::GetTypeMetaId( + const string& name) { + if (name == "FC") { + return TypeMeta::Id(); + } else if (name == "Conv") { + return TypeMeta::Id(); + } else { + CAFFE_THROW("Class type is not supported: ", name); + return TypeMeta::Id(); + } +} + +TypeMeta Int8DNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType( + const void* c) { + // There might be some problem if type if FC. + // We should use a different function. 
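// Note: the cast below relies on an assumption, not a guarantee: it is valid only
// if Int8FCDNNLowPPackedWeightBlob and Int8ConvDNNLowPPackedWeightBlob expose
// `original_tensor` compatibly, so that reading the dtype through the Conv type
// also happens to work for FC blobs. If that layout assumption ever changes,
// dispatch on the blob's actual TypeMeta instead of casting unconditionally.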
+ const Int8ConvDNNLowPPackedWeightBlob* int8_tensor = + reinterpret_cast(c); + return (int8_tensor->original_tensor).dtype(); +} + +vector +Int8DNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorInfo( + const void* c, + size_t* capacity, + DeviceOption* device) { + const Int8ConvDNNLowPPackedWeightBlob* int8_tensor = + reinterpret_cast(c); + return GetTensorInfo(&(int8_tensor->original_tensor), capacity, device); +} + +void Int8DNNLowpPackedWeightBlobShapeFunctions::LoadInfoOfBlob( + const Blob* blob, + std::vector* scale, + std::vector* offset, + uint32_t* axis) { + scale->clear(); + offset->clear(); + const Int8ConvDNNLowPPackedWeightBlob* int8_tensor = + reinterpret_cast(blob->GetRaw()); + const auto& qparams = int8_tensor->qparams; + for (const auto& qparam : qparams) { + scale->emplace_back(qparam.scale); + offset->emplace_back(static_cast(qparam.zero_point)); + } + *axis = 1; +} + +void Int8DNNLowpPackedWeightBlobShapeFunctions::SetupExternalTensorDescriptor( + const Blob* blob, + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets, + ExternalTensorDescriptor* desc) { + const auto& dnntensor = blob->template Get(); + const Tensor& cpu_tensor = dnntensor.original_tensor; + + if (cpu_tensor.template IsType()) { + desc->dataType = kONNXIFI_DATATYPE_UINT8; + desc->buffer = reinterpret_cast(cpu_tensor.data()); + } else if (cpu_tensor.template IsType()) { + desc->dataType = kONNXIFI_DATATYPE_INT32; + desc->buffer = reinterpret_cast(cpu_tensor.data()); + } else if (cpu_tensor.template IsType()) { + desc->dataType = kONNXIFI_DATATYPE_INT8; + desc->buffer = reinterpret_cast(cpu_tensor.data()); + } else { + CAFFE_THROW( + "Unsupported Int8ConvDNNLowPPackedWeightBlob type in ONNXIFI: ", + cpu_tensor.dtype().name()); + } + + desc->quantizationParams = dnntensor.qparams.size(); + desc->quantizationAxis = 1; + std::vector scales, offsets; + for (const auto v : dnntensor.qparams) { + scales.emplace_back(v.scale); + offsets.emplace_back(v.zero_point); + } + all_scales->push_back(scales); + all_offsets->push_back(offsets); + desc->scales = all_scales->back().data(); + desc->biases = reinterpret_cast(all_offsets->back().data()); + + // Set up dim and shape + const auto shape = cpu_tensor.sizes(); + desc->dimensions = shape.size(); + shapes->emplace_back(shape.cbegin(), shape.cend()); + desc->shape = shapes->back().data(); +} + // Explicitly register TypeMeta CAFFE_KNOWN_TYPE(Int8FCDNNLowPPackedWeightBlob); CAFFE_KNOWN_TYPE(Int8ConvDNNLowPPackedWeightBlob); +// Register DNNLOWP Type in caffe2 core +REGISTER_EXTERNAL_TENSOR_FUNCTIONS( + (TypeMeta::Id()), + Int8DNNLowpPackedWeightBlobShapeFunctions); +REGISTER_EXTERNAL_TENSOR_FUNCTIONS( + (TypeMeta::Id()), + Int8DNNLowpPackedWeightBlobShapeFunctions); + REGISTER_CPU_OPERATOR_WITH_ENGINE( Int8FCPackWeight, DNNLOWP, diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h index db7ff52d9b9f..31e7e6dd6040 100644 --- a/caffe2/quantization/server/fbgemm_pack_op.h +++ b/caffe2/quantization/server/fbgemm_pack_op.h @@ -101,5 +101,38 @@ fbgemm::CompressedSparseColumn* ExtractOutlierMatrix( int M, int nbits_in_non_outlier, vector& W_quantized); +/* + * Set up used onnxifi data type constexpr + * Should always be synced with onnxifi.h + */ +constexpr uint64_t kONNXIFI_DATATYPE_UINT8 = 2; +constexpr uint64_t kONNXIFI_DATATYPE_INT32 = 6; +constexpr uint64_t kONNXIFI_DATATYPE_INT8 = 3; + +class Int8DNNLowpPackedWeightBlobShapeFunctions + : public ExternalTensorFunctionsBase { + public: + 
explicit Int8DNNLowpPackedWeightBlobShapeFunctions() + : ExternalTensorFunctionsBase() {} + ~Int8DNNLowpPackedWeightBlobShapeFunctions() override {} + bool IsSameMetaType(TypeIdentifier id) override; + void SetupExternalTensorDescriptor( + const Blob* blob, + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets, + ExternalTensorDescriptor* desc) override; + void LoadInfoOfBlob( + const Blob* blob, + std::vector* scale, + std::vector* offset, + uint32_t* axis) override; + TypeIdentifier GetTypeMetaId(const string& name) override; + TypeMeta GetExternalTensorType(const void* c) override; + vector GetExternalTensorInfo( + const void* c, + size_t* capacity, + DeviceOption* device) override; +}; } // namespace caffe2 diff --git a/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc b/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc index 414bfe2d3d21..c66bb976280c 100644 --- a/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc +++ b/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc @@ -2,21 +2,39 @@ namespace caffe2 { +namespace { + +// NOTE: clang-format wants to use a different formatting but the +// current formatting should be easier to read. +alignas(64) const int ld_st_masks[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 0, }, + { -1, 0, 0, 0, 0, 0, 0, 0, }, + { -1, -1, 0, 0, 0, 0, 0, 0, }, + { -1, -1, -1, 0, 0, 0, 0, 0, }, + { -1, -1, -1, -1, 0, 0, 0, 0, }, + { -1, -1, -1, -1, -1, 0, 0, 0, }, + { -1, -1, -1, -1, -1, -1, 0, 0, }, + { -1, -1, -1, -1, -1, -1, -1, 0, }, +}; + +} // anonymous namespace + // convert to float16 reducing mantissa, preserving exponent void fp32_to_bfp16(const float* source, size_t size, float* dest) { // Results on a 1 sign, 8 exponent, 7 mantissa constexpr int mask = 0xFFFF0000; __m256 wmask = _mm256_broadcast_ss(reinterpret_cast(&mask)); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m256 data = _mm256_loadu_ps(&source[i]); _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i])); - _mm256_store_ps(tmp, data); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m256 data = _mm256_maskload_ps(&source[i], ld_st_mask); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_and_ps(wmask, data)); } } @@ -26,15 +44,16 @@ void fp32_to_bfp24(const float* source, size_t size, float* dest) { constexpr int mask = 0xFFFFFF00; __m256 wmask = _mm256_broadcast_ss(reinterpret_cast(&mask)); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m256 data = _mm256_loadu_ps(&source[i]); _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i])); - _mm256_store_ps(tmp, data); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m256 data = _mm256_maskload_ps(&source[i], ld_st_mask); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_and_ps(wmask, data)); } } @@ -44,15 +63,16 @@ void fp32_to_bfp14(const float* source, size_t size, float* dest) { constexpr int mask = 0xFFFC0000; __m256 wmask = _mm256_broadcast_ss((float*)(&mask)); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i 
= 0; + for (; i < (size / 8) * 8; i += 8) { __m256 data = _mm256_loadu_ps(&source[i]); _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i])); - _mm256_store_ps(tmp, data); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m256 data = _mm256_maskload_ps(&source[i], ld_st_mask); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_and_ps(wmask, data)); } } @@ -65,15 +85,17 @@ void fp32_to_bfp16_scalar(const float* source, size_t size, float* dest) { // convert to IEEE float16 void fp32_to_fp16(const float* source, size_t size, float* dest) { - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m128i vin_fp16 = _mm256_cvtps_ph(_mm256_loadu_ps(&source[i]), 0); _mm256_storeu_ps(&dest[i], _mm256_cvtph_ps(vin_fp16)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m128i vin_fp16 = _mm256_cvtps_ph(_mm256_set1_ps(source[i]), 0); - _mm256_store_ps(tmp, _mm256_cvtph_ps(vin_fp16)); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m128i vin_fp16 = + _mm256_cvtps_ph(_mm256_maskload_ps(&source[i], ld_st_mask), 0); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_cvtph_ps(vin_fp16)); } } @@ -85,20 +107,25 @@ void fp32_to_bfp16_round(const float* source, size_t size, float* dest) { __m256i woffset = _mm256_set1_epi32(offset); __m256i wmask = _mm256_set1_epi32(mask); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m256i v32int = _mm256_add_epi32( _mm256_loadu_si256(reinterpret_cast(&source[i])), woffset); _mm256_storeu_si256( reinterpret_cast<__m256i*>(&dest[i]), _mm256_and_si256(wmask, v32int)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); __m256i v32int = _mm256_add_epi32( - _mm256_set1_epi32(*reinterpret_cast(&source[i])), woffset); - _mm256_store_si256( - reinterpret_cast<__m256i*>(tmp), _mm256_and_si256(wmask, v32int)); - dest[i] = tmp[0]; + _mm256_maskload_epi32( + reinterpret_cast(&source[i]), ld_st_mask), + woffset); + _mm256_maskstore_epi32( + reinterpret_cast(&dest[i]), + ld_st_mask, + _mm256_and_si256(wmask, v32int)); } } diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 9d1c426e67b6..fb119d625ff8 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -165,6 +165,18 @@ static std::string getPadding(size_t cursor, const std::string& filename, size_t return buf; } +bool PyTorchStreamReader::hasFile(const std::string& name) { + std::stringstream ss; + ss << archive_name_ << "/" << name; + mz_zip_reader_locate_file(ar_.get(), ss.str().c_str(), nullptr, 0); + bool result = ar_->m_last_error != MZ_ZIP_FILE_NOT_FOUND; + if (!result) { + ar_->m_last_error = MZ_ZIP_NO_ERROR; + } + valid("attempting to locate file"); + return result; +} + size_t PyTorchStreamReader::getFileID(const std::string& name) { std::stringstream ss; ss << archive_name_ << "/" << name; diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index 28f71492780c..5ca9dcde7fa6 100644 --- a/caffe2/serialize/inline_container.h +++ 
b/caffe2/serialize/inline_container.h @@ -106,8 +106,8 @@ class CAFFE2_API PyTorchStreamReader final { // return dataptr, size std::tuple getRecord(const std::string& name); - size_t getRecordOffset(const std::string& name); + bool hasFile(const std::string& name); ~PyTorchStreamReader(); diff --git a/caffe2/serialize/inline_container_test.cc b/caffe2/serialize/inline_container_test.cc index 6b4c969665ce..70105f5c3bc6 100644 --- a/caffe2/serialize/inline_container_test.cc +++ b/caffe2/serialize/inline_container_test.cc @@ -39,6 +39,9 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) { // read records through readers PyTorchStreamReader reader(&iss); + ASSERT_TRUE(reader.hasFile("key1")); + ASSERT_TRUE(reader.hasFile("key2")); + ASSERT_FALSE(reader.hasFile("key2000")); at::DataPtr data_ptr; int64_t size; std::tie(data_ptr, size) = reader.getRecord("key1"); @@ -48,7 +51,6 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) { ASSERT_EQ(memcmp(the_file.c_str() + off1, data1.data(), data1.size()), 0); ASSERT_EQ(off1 % kFieldAlignment, 0); - std::tie(data_ptr, size) = reader.getRecord("key2"); size_t off2 = reader.getRecordOffset("key2"); ASSERT_EQ(off2 % kFieldAlignment, 0); diff --git a/caffe2/sgd/iter_op.cc b/caffe2/sgd/iter_op.cc index a2fc9e56abf3..6b5d20a189c8 100644 --- a/caffe2/sgd/iter_op.cc +++ b/caffe2/sgd/iter_op.cc @@ -50,6 +50,7 @@ OPERATOR_SCHEMA(AtomicIter) .NumInputs(2) .NumOutputs(1) .EnforceInplace({{1, 0}}) + .IdenticalTypeAndShapeOfInput(1) .SetDoc(R"DOC( Similar to Iter, but takes a mutex as the first input to make sure that updates are carried out atomically. This can be used in e.g. Hogwild sgd @@ -60,4 +61,4 @@ algorithms. NO_GRADIENT(Iter); NO_GRADIENT(AtomicIter); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc index d6c4260bd79c..c4857884a643 100644 --- a/caffe2/sgd/learning_rate_op.cc +++ b/caffe2/sgd/learning_rate_op.cc @@ -6,6 +6,12 @@ REGISTER_CPU_OPERATOR(LearningRate, LearningRateOp); OPERATOR_SCHEMA(LearningRate) .NumInputs(1) .NumOutputs(1) + .TensorInferenceFunction([](const OperatorDef&, + const vector& in) { + vector out(1); + out[0] = in[0]; + return out; + }) .SetDoc(R"DOC( Learning rate is a decreasing function of time. With low learning rates the improvements will be linear. 
With high learning rates they will start to look diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 836e0b75026c..44966972d130 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -2687,6 +2687,22 @@ CAFFE2_CUDA_EXPORT void CopyVector( } } +template <> +CAFFE2_CUDA_EXPORT void CopyVector( + const int N, + const int* src, + int* dst, + CUDAContext* context) { + if (src != dst && N > 0) { + cudaMemcpyAsync( + dst, + src, + sizeof(int) * N, + cudaMemcpyDeviceToDevice, + context->cuda_stream()); + } +} + namespace { template diff --git a/caffe2/utils/signal_handler.cc b/caffe2/utils/signal_handler.cc index 5620eb801ca5..8b9db0ae0fcd 100644 --- a/caffe2/utils/signal_handler.cc +++ b/caffe2/utils/signal_handler.cc @@ -123,15 +123,13 @@ struct { const char* name; int signum; struct sigaction previous; -} kSignalHandlers[] = { - { "SIGABRT", SIGABRT, {} }, - { "SIGINT", SIGINT, {} }, - { "SIGILL", SIGILL, {} }, - { "SIGFPE", SIGFPE, {} }, - { "SIGBUS", SIGBUS, {} }, - { "SIGSEGV", SIGSEGV, {} }, - { nullptr, 0, {} } -}; +} kSignalHandlers[] = {{"SIGABRT", SIGABRT, {}}, + {"SIGINT", SIGINT, {}}, + {"SIGILL", SIGILL, {}}, + {"SIGFPE", SIGFPE, {}}, + {"SIGBUS", SIGBUS, {}}, + {"SIGSEGV", SIGSEGV, {}}, + {nullptr, 0, {}}}; struct sigaction* getPreviousSigaction(int signum) { for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) { @@ -433,7 +431,7 @@ REGISTER_CAFFE2_INIT_FUNCTION( "Inits signal handlers for fatal signals so we can see what if" " caffe2_print_stacktraces is set."); -} // namepsace internal +} // namespace internal #endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS) } // namespace caffe2 @@ -444,7 +442,12 @@ REGISTER_CAFFE2_INIT_FUNCTION( namespace caffe2 { SignalHandler::SignalHandler( SignalHandler::Action SIGINT_action, - SignalHandler::Action SIGHUP_action) {} + SignalHandler::Action SIGHUP_action) { + SIGINT_action_ = SIGINT_action; + SIGHUP_action_ = SIGHUP_action; + my_sigint_count_ = 0; + my_sighup_count_ = 0; +} SignalHandler::~SignalHandler() {} bool SignalHandler::GotSIGINT() { return false; diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h index c7e0dab5f75e..c5441e600390 100644 --- a/caffe2/video/video_input_op.h +++ b/caffe2/video/video_input_op.h @@ -484,8 +484,10 @@ VideoInputOp::VideoInputOp( label_shape[1] = num_of_class_; ReinitializeTensor(&prefetched_label_, label_shape, at::dtype().device(CPU)); } else { - prefetched_label_.Resize( - vector(1, batch_size_ * clip_per_video_ * multi_crop_count_)); + ReinitializeTensor( + &prefetched_label_, + vector(1, batch_size_ * clip_per_video_ * multi_crop_count_), + at::dtype().device(CPU)); } ReinitializeTensor(&prefetched_video_id_, vector(1, batch_size_ * clip_per_video_ * multi_crop_count_), at::dtype().device(CPU)); diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index fc9f1a74fc5f..8a2c3b1a2fc9 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -101,11 +101,21 @@ if (INTERN_BUILD_ATEN_OPS) IF(CXX_AVX2_FOUND) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION") + + # Some versions of GCC pessimistically split unaligned load and store + # instructions when using the default tuning. This is a bad choice on + # new Intel and AMD processors so we disable it when compiling with AVX2. 
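  # As an illustration (hypothetical example, not code from this repository): with
  # GCC's default generic tuning, an unaligned 256-bit access such as
  #   __m256 v = _mm256_loadu_ps(p);
  #   _mm256_storeu_ps(q, v);
  # may be compiled as two 128-bit halves (vmovups xmm plus vinsertf128/vextractf128)
  # instead of single 256-bit vmovups instructions; the -mno-avx256-split-unaligned-*
  # flags checked below keep the full-width form on CPUs where it is faster.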
+ # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top + check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT) + IF(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + SET(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + ENDIF(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + LIST(APPEND CPU_CAPABILITY_NAMES "AVX2") IF(MSVC) LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2") ELSE(MSVC) - LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}") ENDIF(MSVC) ENDIF(CXX_AVX2_FOUND) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index f0fb061b427f..2ddc2abb6580 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1101,15 +1101,6 @@ if (NOT INTERN_BUILD_MOBILE) ENDIF () ENDIF() - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9") - if (CUDA_VERSION VERSION_LESS "8.0") - MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__") - endif() - endif() - endif() - LIST(APPEND CUDA_NVCC_FLAGS -Wno-deprecated-gpu-targets) LIST(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda) diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake index 5aace851397c..8ec0b44310fb 100644 --- a/cmake/Modules/FindOpenMP.cmake +++ b/cmake/Modules/FindOpenMP.cmake @@ -101,7 +101,7 @@ function(_OPENMP_FLAG_CANDIDATES LANG) set(OMP_FLAG_Intel "-qopenmp") endif() set(OMP_FLAG_MIPSpro "-mp") - set(OMP_FLAG_MSVC "-openmp") + set(OMP_FLAG_MSVC "-openmp:experimental" "-openmp") set(OMP_FLAG_PathScale "-openmp") set(OMP_FLAG_NAG "-openmp") set(OMP_FLAG_Absoft "-openmp") diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index f0ca7a5d18f7..ce215a736244 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -28,6 +28,9 @@ endif() message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) +if(CUDA_VERSION VERSION_LESS 9.0) + message(FATAL_ERROR "PyTorch requires CUDA 9.0 and above.") +endif() if(CUDA_FOUND) # Sometimes, we may mismatch nvcc with the CUDA headers we are @@ -299,17 +302,6 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif() endif() -if (${CUDA_VERSION} LESS 8.0) # CUDA 7.x - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") -elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. 
- list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") -endif() - # Add onnx namepsace definition to nvcc if (ONNX_NAMESPACE) list(APPEND CUDA_NVCC_FLAGS "-DONNX_NAMESPACE=${ONNX_NAMESPACE}") @@ -333,16 +325,6 @@ if ((CUDA_VERSION VERSION_EQUAL 9.0) OR "variable to use another version (for example): \n" " export CUDAHOSTCXX='/usr/bin/gcc-5'\n") endif() -elseif (CUDA_VERSION VERSION_EQUAL 8.0) - # CUDA 8.0 requires GCC version <= 5 - if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND - NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 6.0 AND - CUDA_HOST_COMPILER STREQUAL CMAKE_C_COMPILER) - message(FATAL_ERROR - "CUDA 8.0 is not compatible with GCC version >= 6. " - "Use the following option to use another version (for example): \n" - " -DCUDA_HOST_COMPILER=/usr/bin/gcc-5\n") - endif() endif() # setting nvcc arch flags diff --git a/docker/caffe2/jenkins/common/install_python.sh b/docker/caffe2/jenkins/common/install_python.sh index da79ab80bcf4..b22702a3a278 100755 --- a/docker/caffe2/jenkins/common/install_python.sh +++ b/docker/caffe2/jenkins/common/install_python.sh @@ -165,5 +165,7 @@ pip install --no-cache-dir \ mock \ typing \ typing-extensions \ - pyyaml + pyyaml \ + librosa>=0.6.2 \ + psutil diff --git a/docs/cpp/source/check-doxygen.sh b/docs/cpp/source/check-doxygen.sh index 9959a3fc2ba6..4311227cb91d 100755 --- a/docs/cpp/source/check-doxygen.sh +++ b/docs/cpp/source/check-doxygen.sh @@ -34,8 +34,10 @@ popd doxygen 2> original-doxygen-log.txt cp original-doxygen-log.txt doxygen-log.txt -echo "Original output" -cat original-doxygen-log.txt +# Uncomment this if you need it for debugging; we're not printing this +# by default because it is confusing. +# echo "Original output" +# cat original-doxygen-log.txt # Filter out some warnings. ignore_warning "warning: no uniquely matching class member found for" @@ -44,9 +46,12 @@ ignore_warning "warning: explicit link request to 'Item' could not be resolved" # Count the number of remaining warnings. warnings="$(grep 'warning:' doxygen-log.txt | wc -l)" +echo "Treating all remaining warnings as errors" + if [[ "$warnings" -ne "0" ]]; then - echo "Filtered output" + echo "Failing Doxygen test because the following warnings were treated fatally:" cat doxygen-log.txt + echo "Please fix these warnings. To run this test locally, use docs/cpp/source/check-doxygen.sh" rm -f doxygen-log.txt original-doxygen-log.txt exit 1 fi diff --git a/docs/cpp/source/installing.rst b/docs/cpp/source/installing.rst index 4e486fe00cdb..9f196ddda59a 100644 --- a/docs/cpp/source/installing.rst +++ b/docs/cpp/source/installing.rst @@ -73,7 +73,6 @@ We can now run the following commands to build the application from within the mkdir build cd build cmake -DCMAKE_PREFIX_PATH=/absolute/path/to/libtorch .. - cd .. make where ``/absolute/path/to/libtorch`` should be the absolute (!) 
path to the unzipped LibTorch diff --git a/docs/libtorch.rst b/docs/libtorch.rst index 6b5c411f5363..fe87bdfaeb94 100644 --- a/docs/libtorch.rst +++ b/docs/libtorch.rst @@ -18,7 +18,7 @@ You can use a python script/module located in tools package to build libtorch Alternatively, you can invoke a shell script in the same directory to achieve the same goal :: cd - BUILD_TORCH=ON ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 + ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 ls torch/lib/tmp_install # output is produced here ls torch/lib/tmp_install/lib/libtorch.so # of particular interest diff --git a/docs/source/_static/img/tensorboard/add_histogram_raw.png b/docs/source/_static/img/tensorboard/add_histogram_raw.png new file mode 100644 index 000000000000..96ebe5c48038 Binary files /dev/null and b/docs/source/_static/img/tensorboard/add_histogram_raw.png differ diff --git a/docs/source/jit.rst b/docs/source/jit.rst index eebd713ca1b8..56b75ae0175c 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -1100,6 +1100,36 @@ Q: How do I store attributes on a ``ScriptModule``? +Q: I would like to trace module's method but I keep getting this error: + +``RuntimeError: Cannot insert a Tensor that requires grad as a constant. Consider making it a parameter or input, or detaching the gradient`` + + This error usually means that, the method you are tracing, uses module's parameters and + you are passing module's method instead of a module instance (e.g. ``my_module_instance.forward`` vs ``my_module_instance``). + - Invoking ``trace`` with module's method captures module parameters (which may require gradients) as **constants**. + - On the other hand, invoking ``trace`` with module's instance (e.g. ``my_module``) creates a new module and correctly copies parameters into the new module, so they can accumulate gradients if required. + Given that ``trace`` treats ``my_module_instance.forward`` as a standalone function, it also means there is **not** currently a way to trace + arbitrary methods in the module except for ``forward`` that use module's parameters. + Version **1.1.1** will add a new API ``trace_module`` that will allow users to trace any method in the module and more than one method :: + + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv = nn.Conv2d(1, 1, 3) + + def forward(self, x): + return self.conv(x) + + def weighted_kernel_sum(self, weight): + return weight * self.conv.weight + + example_weight = torch.rand(1, 1, 3, 3) + example_forward_input = torch.rand(1, 1, 3, 3) + n = Net() + inputs = {'forward' : example_forward_input, 'weighted_kernel_sum' : example_weight} + module = torch.jit.trace_module(n, inputs) + + Builtin Functions ~~~~~~~~~~~~~~~~~ diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 90bf2878e7b7..ef1dcadd6dc8 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -212,10 +212,10 @@ To confirm whether the operator is standardized or not, please check the If the operator is an ATen operator, which means you can find the declaration of the function in ``torch/csrc/autograd/generated/VariableType.h`` (available in generated code in PyTorch install dir), you should add the symbolic -function in ``torch/onnx/symbolic.py`` and follow the instructions listed as below: +function in ``torch/onnx/symbolic_opset.py`` and follow the instructions listed as below: -* Define the symbolic function in - `torch/onnx/symbolic.py `_. 
+* Define the symbolic function in ``torch/onnx/symbolic_opset.py``, for example + `torch/onnx/symbolic_opset9.py `_. Make sure the function has the same name as the ATen operator/function defined in ``VariableType.h``. * The first parameter is always the exported ONNX graph. @@ -303,7 +303,7 @@ The ONNX graph C++ definition is in ``torch/csrc/jit/ir.h``. Here is an example of handling missing symbolic function for ``elu`` operator. We try to export the model and see the error message as below:: - UserWarning: ONNX export failed on elu because torch.onnx.symbolic.elu does not exist + UserWarning: ONNX export failed on elu because torch.onnx.symbolic_opset9.elu does not exist RuntimeError: ONNX export failed: Couldn't export operator elu The export fails because PyTorch does not support exporting ``elu`` operator. @@ -311,7 +311,7 @@ We find ``virtual Tensor elu(const Tensor & input, Scalar alpha, bool inplace) c in ``VariableType.h``. This means ``elu`` is an ATen operator. We check the `ONNX operator list `_, and confirm that ``Elu`` is standardized in ONNX. -We add the following lines to ``symbolic.py``:: +We add the following lines to ``symbolic_opset9.py``:: def elu(g, input, alpha, inplace=False): return g.op("Elu", input, alpha_f=_scalar(alpha)) @@ -319,7 +319,7 @@ We add the following lines to ``symbolic.py``:: Now PyTorch is able to export ``elu`` operator. There are more examples in -`symbolic.py `_, +`symbolic_opset9.py `_, `tensor.py `_, `padding.py `_. diff --git a/docs/source/tensorboard.rst b/docs/source/tensorboard.rst index 46cc0810a91b..d51149759527 100644 --- a/docs/source/tensorboard.rst +++ b/docs/source/tensorboard.rst @@ -86,6 +86,7 @@ Expected result: .. automethod:: add_scalar .. automethod:: add_scalars .. automethod:: add_histogram + .. automethod:: add_histogram_raw .. automethod:: add_image .. automethod:: add_images .. automethod:: add_figure diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 385b26d4d6c3..a86beffabbf2 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -142,6 +142,7 @@ view of a storage and defines numeric operations on it. .. autoattribute:: is_cuda .. autoattribute:: device .. autoattribute:: grad + .. autoattribute:: ndim .. automethod:: abs .. automethod:: abs_ @@ -208,6 +209,7 @@ view of a storage and defines numeric operations on it. .. automethod:: cumsum .. automethod:: data_ptr .. automethod:: dequantize + .. automethod:: dequantize_linear .. automethod:: det .. automethod:: dense_dim .. automethod:: detach diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 27694ce8ab42..cd4d6975101a 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -125,6 +125,8 @@ Parallelism ---------------------------------- .. autofunction:: get_num_threads .. autofunction:: set_num_threads +.. autofunction:: get_num_interop_threads +.. autofunction:: set_num_interop_threads Locally disabling gradient computation -------------------------------------- @@ -223,10 +225,12 @@ Reduction Ops .. autofunction:: norm .. autofunction:: prod .. autofunction:: std +.. autofunction:: std_mean .. autofunction:: sum .. autofunction:: unique .. autofunction:: unique_consecutive .. autofunction:: var +.. 
autofunction:: var_mean Comparison Ops diff --git a/scripts/build_windows.bat b/scripts/build_windows.bat index 0d1130dff65b..c89bc2458041 100644 --- a/scripts/build_windows.bat +++ b/scripts/build_windows.bat @@ -18,10 +18,6 @@ if NOT DEFINED BUILD_SHARED_LIBS ( set BUILD_SHARED_LIBS=OFF ) -if NOT DEFINED BUILD_TORCH ( - set BUILD_TORCH=OFF -) - IF NOT DEFINED BUILDING_WITH_TORCH_LIBS ( set BUILDING_WITH_TORCH_LIBS=OFF ) @@ -50,6 +46,10 @@ if NOT DEFINED USE_OBSERVERS ( set USE_OBSERVERS=OFF ) +if NOT DEFINED MSVC_Z7_OVERRIDE ( + set MSVC_Z7_OVERRIDE=OFF +) + if NOT DEFINED CMAKE_GENERATOR ( if DEFINED APPVEYOR_BUILD_WORKER_IMAGE ( if "%APPVEYOR_BUILD_WORKER_IMAGE%" == "Visual Studio 2017" ( diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 4ad633bd87f9..45393c62f0e4 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -23,7 +23,21 @@ do done set -- "${UNKNOWN[@]}" # leave UNKNOWN -pip install pytest scipy torchvision hypothesis +pip install pytest scipy hypothesis + +install_torchvision() { + echo "Installing torchvision at branch master" + rm -rf vision + # TODO: This git clone is bad, it means pushes to torchvision can break + # PyTorch CI + git clone https://github.com/pytorch/vision --quiet + pushd vision + pip install -q --user . + popd + rm -rf vision +} +install_torchvision + if [[ $PARALLEL == 1 ]]; then pip install pytest-xdist fi diff --git a/setup.py b/setup.py index 441bc08d7082..8c0a5cb56858 100644 --- a/setup.py +++ b/setup.py @@ -594,10 +594,11 @@ def run(self): try: import numpy as np - NUMPY_INCLUDE_DIR = np.get_include() - USE_NUMPY = True except ImportError: USE_NUMPY = False +else: + NUMPY_INCLUDE_DIR = np.get_include() + USE_NUMPY = True if USE_CUDA: if IS_WINDOWS: diff --git a/test/common_methods_invocations.py b/test/common_methods_invocations.py index 6202d5bdecef..18a787b9e46c 100644 --- a/test/common_methods_invocations.py +++ b/test/common_methods_invocations.py @@ -395,6 +395,16 @@ def method_tests(): ('std', (S, S, S), (1, True, True), 'keepdim_dim', (True,), [0]), ('std', (S,), (0,), 'dim_1d', (True,), [0]), ('std', (S,), (0, True, True), 'keepdim_dim_1d', (True,), [0]), + ('__var_mean__', (S, S, S), NO_ARGS, ''), + ('__var_mean__', (S, S, S), (1,), 'dim', [0]), + ('__var_mean__', (S, S, S), (1, True, True), 'keepdim_dim', [0]), + ('__var_mean__', (S,), (0,), 'dim_1d', [0]), + ('__var_mean__', (S,), (0, True, True), 'keepdim_dim_1d', [0]), + ('__std_mean__', (S, S, S), NO_ARGS, ''), + ('__std_mean__', (S, S, S), (1,), 'dim', [0]), + ('__std_mean__', (S, S, S), (1, True, True), 'keepdim_dim', [0]), + ('__std_mean__', (S,), (0,), 'dim_1d', [0]), + ('__std_mean__', (S,), (0, True, True), 'keepdim_dim_1d', [0]), ('renorm', (S, S, S), (2, 1, 0.5), 'dim', (), [1]), ('renorm', (S, S, S), (1, 2, 3), 'norm_1'), ('renorm', (S, S, S), (inf, 2, 0.5), 'norm_inf'), @@ -456,8 +466,12 @@ def method_tests(): ('ger', (S,), ((M,),)), ('matmul', (L,), ((L,),), '', (True,)), ('matmul', (S, M), ((M,),), "2d_1d", (True,)), - ('matmul', (M, ), ((M, S),), "1d_2d", (True,)), + ('matmul', (M,), ((M, S),), "1d_2d", (True,)), ('matmul', (S, M), ((M, S),), "2d_2d", (True,)), + ('matmul', (S, S, M), ((M,),), "3d_1d", (True,)), + ('matmul', (S, S, M), ((M, S),), "3d_2d", (True,)), + ('matmul', (M,), ((S, M, S),), "1d_3d", (True,)), + ('matmul', (S, M), ((S, M, S),), "2d_3d", (True,)), ('matmul', (S, S, M, M), ((S, S, M, S),), "4d_4d", (True,)), ('matmul', (S, S, M, M), ((M,),), "4d_1d", (True,)), ('matmul', (M,), ((S, S, M, S),), "1d_4d", (True,)), diff --git 
a/test/common_utils.py b/test/common_utils.py index ca9b0b9184c2..4ef97e85e936 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -20,10 +20,12 @@ import socket import time from collections import OrderedDict +from contextlib import contextmanager from functools import wraps from itertools import product from copy import deepcopy from numbers import Number +import tempfile import __main__ import errno @@ -66,6 +68,24 @@ def run_tests(argv=UNITTEST_ARGS): # Environment variable `IS_PYTORCH_CI` is set in `.jenkins/common.sh`. IS_PYTORCH_CI = bool(os.environ.get('IS_PYTORCH_CI', 0)) +if IS_WINDOWS: + @contextmanager + def TemporaryFileName(): + # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile + # opens the file, and it cannot be opened multiple times in Windows. To support Windows, + # close the file after creation and try to remove it manually + f = tempfile.NamedTemporaryFile(delete=False) + try: + f.close() + yield f.name + finally: + os.unlink(f.name) +else: + @contextmanager # noqa: T484 + def TemporaryFileName(): + with tempfile.NamedTemporaryFile() as f: + yield f.name + def _check_module_exists(name): r"""Returns if a top-level module with :attr:`name` exists *without** diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index ef63856d5680..d59946199b35 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -30,7 +30,7 @@ endif() add_executable(test_api ${TORCH_API_TEST_SOURCES}) target_include_directories(test_api PRIVATE ${ATen_CPU_INCLUDE}) -target_link_libraries(test_api PRIVATE torch gtest) +target_link_libraries(test_api PRIVATE caffe2 gtest) if (USE_CUDA) target_link_libraries(test_api PRIVATE @@ -38,6 +38,9 @@ if (USE_CUDA) ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) + + target_link_libraries(test_api PRIVATE caffe2_gpu) + target_compile_definitions(test_api PRIVATE "USE_CUDA") endif() diff --git a/test/cpp/api/torch_include.cpp b/test/cpp/api/torch_include.cpp index 1bcde267cc17..d85e728de88e 100644 --- a/test/cpp/api/torch_include.cpp +++ b/test/cpp/api/torch_include.cpp @@ -9,4 +9,6 @@ TEST(TorchIncludeTest, GetSetNumThreads) { torch::init_num_threads(); torch::set_num_threads(2); ASSERT_EQ(torch::get_num_threads(), 2); + torch::set_num_interop_threads(2); + ASSERT_EQ(torch::get_num_interop_threads(), 2); } diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 66860ebef0ab..263a306dc848 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -4,7 +4,8 @@ add_executable(test_jit ${TORCH_ROOT}/test/cpp/common/main.cpp ${JIT_TEST_ROOT}/test.cpp) -target_link_libraries(test_jit PRIVATE torch gtest) +target_link_libraries(test_jit PRIVATE caffe2 gtest) +target_include_directories(test_jit PRIVATE ${ATen_CPU_INCLUDE}) target_compile_definitions(test_jit PRIVATE USE_GTEST) if (USE_CUDA) @@ -13,5 +14,8 @@ if (USE_CUDA) ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) + + target_link_libraries(test_jit PRIVATE caffe2_gpu) + target_compile_definitions(test_jit PRIVATE USE_CUDA) endif() diff --git a/test/cpp/jit/test.cpp b/test/cpp/jit/test.cpp index 78a527976f24..7a7b45b3c386 100644 --- a/test/cpp/jit/test.cpp +++ b/test/cpp/jit/test.cpp @@ -31,6 +31,7 @@ #include #include #include +#include using namespace torch::jit::script; using namespace torch::jit::test; @@ -75,12 +76,14 @@ namespace jit { _(NoneSchemaMatch) \ _(ClassParser) \ _(Profiler) \ + _(InsertGuards) \ _(PeepholeOptimize) \ _(RecordFunction) \ 
_(SubgraphMatching) \ _(ModuleDefine) \ _(QualifiedName) \ - _(ClassImport) + _(ClassImport) \ + _(ScriptObject) #define TH_FORALL_TESTS_CUDA(_) \ _(ArgumentSpec) \ @@ -108,7 +111,7 @@ TH_FORALL_TESTS_CUDA(JIT_GTEST_CUDA) #endif #define JIT_TEST(name) test##name(); -void runJITCPPTests(bool runCuda) { +TORCH_API void runJITCPPTests(bool runCuda) { TH_FORALL_TESTS(JIT_TEST) if (runCuda) { TH_FORALL_TESTS_CUDA(JIT_TEST) diff --git a/test/cpp/jit/test_alias_analysis.h b/test/cpp/jit/test_alias_analysis.h index d3d9550971e9..87a07f548ae9 100644 --- a/test/cpp/jit/test_alias_analysis.h +++ b/test/cpp/jit/test_alias_analysis.h @@ -680,6 +680,48 @@ graph(): AT_ASSERT(!aliasDb.mayContainAlias(first_st, second_st)); AT_ASSERT(!aliasDb.mayContainAlias(second_st, tup_st)); } + { + // Test list container aliasing + auto graph = std::make_shared(); + std::unordered_map vmap; + script::parseIR( + R"IR( +graph(): + %10 : bool? = prim::Constant() + %8 : Device? = prim::Constant() + %4 : int? = prim::Constant() + %0 : int = prim::Constant[value=2]() + %1 : int = prim::Constant[value=3]() + %2 : int[] = prim::ListConstruct(%0, %1) + %x : Tensor = aten::rand(%2, %4, %4, %8, %10) + %12 : int[] = prim::ListConstruct(%0, %1) + %y : Tensor = aten::rand(%12, %4, %4, %8, %10) + %22 : int[] = prim::ListConstruct(%0, %1) + %z : Tensor = aten::rand(%22, %4, %4, %8, %10) + %32 : int[] = prim::ListConstruct(%0, %1) + %fresh : Tensor = aten::rand(%32, %4, %4, %8, %10) + %foo : Tensor[] = prim::ListConstruct(%x, %y) + %43 : Tensor[] = aten::append(%foo, %z) + return () +)IR", + graph.get(), + vmap); + AliasDb aliasDb(graph); + auto x = vmap["x"]; + auto y = vmap["y"]; + auto z = vmap["z"]; + // Tensors x, y, and z went into a list, so they all may alias each other. + ASSERT_TRUE(aliasDb.mayAlias(x, y)); + ASSERT_TRUE(aliasDb.mayAlias(y, z)); + ASSERT_TRUE(aliasDb.mayAlias(x, z)); + + // But we know `fresh` didn't go into a list, so x, y, and z should not + // alias it. 
+ auto fresh = vmap["fresh"]; + ASSERT_FALSE(aliasDb.mayAlias(x, fresh)); + ASSERT_FALSE(aliasDb.mayAlias(y, fresh)); + ASSERT_FALSE(aliasDb.mayAlias(z, fresh)); + } } void testWildcards() { @@ -707,7 +749,7 @@ void testWildcards() { AliasDb aliasDb(graph); ASSERT_FALSE(aliasDb.mayAlias(a, fresh)); - ASSERT_TRUE(aliasDb.mayAlias(wildcard, fresh)); + ASSERT_FALSE(aliasDb.mayAlias(wildcard, fresh)); ASSERT_TRUE(aliasDb.mayAlias(wildcard, a)); ASSERT_FALSE(aliasDb.mayAlias( std::unordered_set({wildcard}), @@ -719,8 +761,7 @@ void testWildcards() { { graph->lint(); AliasDb aliasDb(graph); - // Any write should be considered a write to the wildcard - ASSERT_TRUE(aliasDb.hasWriters(wildcard->node())); + ASSERT_FALSE(aliasDb.hasWriters(wildcard->node())); } const auto wildcardWrite = graph->insert(writes, {wildcard})->node(); @@ -728,9 +769,9 @@ void testWildcards() { graph->lint(); AliasDb aliasDb(graph); // Test writes to wildcards - ASSERT_TRUE(aliasDb.writesToAlias( + ASSERT_FALSE(aliasDb.writesToAlias( wildcardWrite, std::unordered_set{fresh})); - ASSERT_TRUE(aliasDb.writesToAlias( + ASSERT_FALSE(aliasDb.writesToAlias( wildcardWrite, std::unordered_set{fresh2})); ASSERT_TRUE(aliasDb.writesToAlias( wildcardWrite, std::unordered_set{a})); diff --git a/test/cpp/jit/test_base.h b/test/cpp/jit/test_base.h index 7726308982f8..64e7203dc96e 100644 --- a/test/cpp/jit/test_base.h +++ b/test/cpp/jit/test_base.h @@ -10,9 +10,9 @@ #include #else #include "c10/util/Exception.h" -#define ASSERT_EQ(x, y) AT_ASSERT((x) == (y)) -#define ASSERT_NE(x, y) AT_ASSERT((x) != (y)) -#define ASSERT_TRUE AT_ASSERT +#define ASSERT_EQ(x, y) TORCH_INTERNAL_ASSERT((x) == (y)) +#define ASSERT_NE(x, y) TORCH_INTERNAL_ASSERT((x) != (y)) +#define ASSERT_TRUE TORCH_INTERNAL_ASSERT #define ASSERT_FALSE(x) ASSERT_TRUE(!(x)) #define ASSERT_THROWS_WITH(statement, substring) \ try { \ diff --git a/test/cpp/jit/test_class_import.h b/test/cpp/jit/test_class_import.h index d8b3f8423d2b..0ba81b22b345 100644 --- a/test/cpp/jit/test_class_import.h +++ b/test/cpp/jit/test_class_import.h @@ -1,8 +1,11 @@ #pragma once -#include #include +#include + +#include #include +#include namespace torch { namespace jit { @@ -13,10 +16,12 @@ op_version_set = 1 class FooNestedTest: def __init__(self, y): self.y = y + class FooNestedTest2: def __init__(self, y): self.y = y self.nested = __torch__.FooNestedTest(y) + class FooTest: def __init__(self, x): self.class_attr = __torch__.FooNestedTest(x) @@ -58,6 +63,37 @@ void testClassImport() { ASSERT_FALSE(c); } +void testScriptObject() { + Module m1; + Module m2; + std::vector constantTable; + import_libs( + m1.class_compilation_unit(), + "__torch__", + classSrcs1, + constantTable, + nullptr); + import_libs( + m2.class_compilation_unit(), + "__torch__", + classSrcs2, + constantTable, + nullptr); + + // Incorrect arguments for constructor should throw + c10::QualifiedName base("__torch__"); + ASSERT_ANY_THROW(m1.create_class(c10::QualifiedName(base, "FooTest"), {1})); + auto x = torch::ones({2, 3}); + auto obj = m2.create_class(c10::QualifiedName(base, "FooTest"), x).toObject(); + auto dx = obj->getAttr("dx"); + ASSERT_TRUE(test::almostEqual(x, dx.toTensor())); + + auto new_x = torch::rand({2, 3}); + obj->setAttr("dx", new_x); + auto new_dx = obj->getAttr("dx"); + ASSERT_TRUE(test::almostEqual(new_x, new_dx.toTensor())); +} + } // namespace script } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_fuser.h b/test/cpp/jit/test_fuser.h index 29ed1a68ccec..6f28520820f5 100644 --- 
a/test/cpp/jit/test_fuser.h +++ b/test/cpp/jit/test_fuser.h @@ -197,7 +197,7 @@ void testRegisterFusionCachesKernel(std::ostream& out = std::cout) { std::find_if(nodes.begin(), nodes.end(), [](const Node* node) { return node->kind() == prim::FusionGroup; }); - AT_CHECK( + TORCH_CHECK( maybe_fusion_group != nodes.end(), "testRegisterFusionCachesKernel: could not create FusionGroup"); return *maybe_fusion_group; diff --git a/test/cpp/jit/test_misc.h b/test/cpp/jit/test_misc.h index e2de3a4973ab..cf3ec10b1c1a 100644 --- a/test/cpp/jit/test_misc.h +++ b/test/cpp/jit/test_misc.h @@ -23,6 +23,7 @@ #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/passes/graph_fuser.h" +#include "torch/csrc/jit/passes/insert_guards.h" #include "torch/csrc/jit/passes/lower_grad_of.h" #include "torch/csrc/jit/passes/lower_tuples.h" #include "torch/csrc/jit/passes/requires_grad_analysis.h" @@ -642,22 +643,22 @@ void checkTracedInputs(const TracedTestInputs& inputs) { const auto& sizes = std::get<1>(input); if (fn == "test") { found_test = true; - AT_CHECK(sizes.size() == 1); - AT_CHECK(sizes[0] == std::vector({1, 2, 3})); + TORCH_CHECK(sizes.size() == 1); + TORCH_CHECK(sizes[0] == std::vector({1, 2, 3})); } else if (fn == "test::pow") { found_pow = true; - AT_CHECK(sizes.size() == 2); - AT_CHECK(sizes[0] == std::vector({1, 2, 3})); - AT_CHECK(sizes[1].empty()); + TORCH_CHECK(sizes.size() == 2); + TORCH_CHECK(sizes[0] == std::vector({1, 2, 3})); + TORCH_CHECK(sizes[1].empty()); } else if (fn.find("::mul") != std::string::npos) { found_mul = true; - AT_CHECK(sizes.size() > 1); - AT_CHECK(sizes[0] == std::vector({1, 2, 3})); + TORCH_CHECK(sizes.size() > 1); + TORCH_CHECK(sizes[0] == std::vector({1, 2, 3})); } } - AT_CHECK(found_test); - AT_CHECK(found_pow); - AT_CHECK(found_mul); + TORCH_CHECK(found_test); + TORCH_CHECK(found_pow); + TORCH_CHECK(found_mul); } std::string getFullName(const autograd::profiler::RecordFunction* fn_ptr) { @@ -683,13 +684,17 @@ void testRecordFunction() { for (const auto& input : inputs) { if (input.isTensor()) { sizes.push_back(input.toTensor().sizes().vec()); - } else if (input.isScalar()){ + } else if (input.isScalar()) { sizes.push_back(std::vector()); } } traced_inputs.push_back( std::make_tuple(std::string(getFullName(&fn)), sizes)); - }, [](const autograd::profiler::RecordFunction&) {}, true); + }, + [](const autograd::profiler::RecordFunction&) {}, + true); + + autograd::profiler::setSamplingProbability(1.0); auto t = torch::randn({1, 2, 3}, at::kCPU); t.set_requires_grad(true); @@ -736,7 +741,7 @@ void testAutogradProfiler() { for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos; count++, pos++) { } - AT_CHECK(count == 200); + TORCH_CHECK(count == 200); } void testNoneSchemaMatch() { @@ -797,7 +802,7 @@ void testModuleConversion() { // test cuda to cpu for params and buffers m->register_parameter("foo", torch::ones({}, at::kCUDA), false); m->register_buffer("bar", torch::ones({}, at::kCUDA)); - + m->to(at::kCUDA); m->to(at::kCPU); AT_ASSERT(m->get_parameter("foo").data().device().is_cpu()); @@ -807,14 +812,13 @@ void testModuleConversion() { // test cpu to cuda for params and buffers m->register_parameter("foo", torch::ones({}), false); m->register_buffer("bar", torch::ones({})); - + m->to(at::kCUDA); AT_ASSERT(m->get_parameter("foo").data().device().is_cuda()); AT_ASSERT(m->get_buffer("bar").data().device().is_cuda()); } } - static int testPassValue = 0; void 
fakePass(std::shared_ptr& g) { testPassValue++; @@ -841,12 +845,53 @@ graph(%a): AT_ASSERT(testPassValue); } -static void checkShape(Node* n, std::vector expected) { - auto tp = n->output()->type(); +static void checkShape( + Node* n, + std::vector expected, + bool prev = true) { + auto profile = (prev) ? n->inputs().at(0)->node() : n; + auto tp = profile->output()->type(); auto ptp = tp->expect(); ASSERT_EQ(ptp->sizes().concrete_sizes().value(), expected); } +void testInsertGuards() { + static const auto basic_example = R"JIT( + def basic(x, y): + a = x + y + b = x * y + c = x + 1 + d = a - c + e = b - c + return d + e + )JIT"; + + auto cu = compile(basic_example); + auto& fun = cu->get_function("basic"); + auto pr = ProfilingRecord::instrumentGraph(fun.graph()); + auto x = at::randn({2, 3}, at::kCPU); + auto y = at::randn({2, 3}, at::kCPU); + auto v = [](at::Tensor t) { return autograd::make_variable(t, false); }; + auto stack = createStack({v(x), v(y)}); + // introduce some profiling information + Code cd(pr->profiled_graph_); + InterpreterState is{cd}; + is.run(stack); + auto copy = pr->profiled_graph_->copy(); + InsertGuards(copy); + auto nodes = copy->block()->nodes(); + auto guard = std::find_if(nodes.begin(), nodes.end(), [](Node* n) { + return n->kind() == prim::Guard; + }); + ASSERT_NE(guard, nodes.end()); + ASSERT_EQ(guard->input()->type()->cast(), nullptr); + checkShape(*guard, {2, 3}, false); + int num_guards = std::count_if(nodes.begin(), nodes.end(), [](Node* n) { + return n->kind() == prim::Guard; + }); + ASSERT_EQ(num_guards, 11); +} + void testProfiler() { constexpr int batch_size = 4; constexpr int input_size = 256; @@ -879,7 +924,7 @@ void testProfiler() { auto mm = std::find_if(begin, end, [](Node* n) { return n->kind() == aten::mm; }); ASSERT_NE(mm, end); - std::vector mm_expected{4, 2048}; + std::vector mm_expected{4, 256}; std::vector eltwise{4, 512}; checkShape(*mm, mm_expected); auto sigmoid_n = std::find_if( diff --git a/test/cpp/jit/test_subgraph_matcher.h b/test/cpp/jit/test_subgraph_matcher.h index ee157de9f8bd..f33e19e5c7ba 100644 --- a/test/cpp/jit/test_subgraph_matcher.h +++ b/test/cpp/jit/test_subgraph_matcher.h @@ -361,6 +361,80 @@ graph(%x, %y): AT_ASSERT(findPatternMatches(pattern1, graph).size() == 0); } +void testAttributes() { + Graph graph; + script::parseIR( + R"IR( +graph(%0): + %a = a::a[isattr=[1,2]](%0) + %b = a::b[intattr=10, floatattr=3.14](%0) + %c = a::c[myattr="qqq"](%a, %b) + return (%c))IR", + &graph); + + { + Graph pattern; + script::parseIR( + R"IR( +graph(%a, %b): + %c = a::c[myattr="qqq"](%a, %b) + return (%c))IR", + &pattern); + AT_ASSERT(!findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%a, %b): + %c = a::c[myattr="zzz"](%a, %b) + return (%c))IR", + &pattern); + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %b = a::b[extraattr=10](%0) + return (%b))IR", + &pattern); + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %b = a::b[intattr=10, floatattr=3.14](%0) + return (%b))IR", + &pattern); + AT_ASSERT(!findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %b = a::b[intattr=10, floatattr=3.14, strattr="rrr"](%0) + return (%b))IR", + &pattern); + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %a = 
a::a[isattr=[1,2]](%0) + return (%a))IR", + &pattern); + // Lists are not supported yet, thus we shouldn't match for now. + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } +} + void testBadPattern() { Graph graph, pattern1, pattern2; script::parseIR( @@ -405,6 +479,7 @@ void testSubgraphMatching() { testOverlappingMatches(); testMatchInBasicBlocks1(); testMatchInBasicBlocks2(); + testAttributes(); testBadPattern(); } diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index 459ffb30dc21..f512dc6dbc97 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -40,7 +40,7 @@ struct ComplexCPUType : public at::CPUTypeDefault { AT_ASSERT(options.device().is_cpu()); for (auto x: size) { - AT_CHECK(x >= 0, "Trying to create tensor using size with negative dimension: ", size); + TORCH_CHECK(x >= 0, "Trying to create tensor using size with negative dimension: ", size); } auto* allocator = at::getCPUAllocator(); int64_t nelements = at::prod_intlist(size); diff --git a/test/cpp_extensions/cuda_extension.cpp b/test/cpp_extensions/cuda_extension.cpp index ad7396fe7f45..d6349b8aa0b3 100644 --- a/test/cpp_extensions/cuda_extension.cpp +++ b/test/cpp_extensions/cuda_extension.cpp @@ -6,8 +6,8 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size); torch::Tensor sigmoid_add(torch::Tensor x, torch::Tensor y) { - AT_CHECK(x.type().is_cuda(), "x must be a CUDA tensor"); - AT_CHECK(y.type().is_cuda(), "y must be a CUDA tensor"); + TORCH_CHECK(x.type().is_cuda(), "x must be a CUDA tensor"); + TORCH_CHECK(y.type().is_cuda(), "y must be a CUDA tensor"); auto output = torch::zeros_like(x); sigmoid_add_cuda( x.data(), y.data(), output.data(), output.numel()); diff --git a/test/expect/TestScript.test_print-stdout.expect b/test/expect/TestScript.test_print-stdout.expect index 0131b4bcc523..f4449c73b849 100644 --- a/test/expect/TestScript.test_print-stdout.expect +++ b/test/expect/TestScript.test_print-stdout.expect @@ -2,4 +2,4 @@ 0.9526 0.9975 0.9999 -[ Variable[CPUType]{4} ] 1 2 [1, 2] [1., 2.] +[ Variable[CPUDoubleType]{4} ] 1 2 [1, 2] [1., 2.] 
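For orientation only (not part of the patch): the expect-file change just above, and the similar `test_string_print` one that follows, record that TorchScript's `print` of a tensor now reports the concrete dtype-qualified type (`CPUDoubleType`, `CPULongType`) instead of the generic `CPUType`. Below is a minimal sketch of the kind of scripted print these expect files cover, assuming only the public `torch.jit.script` API; the `print_various` helper is hypothetical and the exact output string depends on the TorchScript print builtin of this era.

```python
import torch

@torch.jit.script
def print_various(x):
    # At the time of this patch, TorchScript's print builtin formats tensors
    # with their concrete backend/dtype type, which is what the expect file
    # above records (e.g. "[ Variable[CPUDoubleType]{4} ] 1 2 [1, 2] [1., 2.]").
    print(x, 1, 2, [1, 2], [1.0, 2.0])

# A double-precision input is what makes the printed type CPUDoubleType
# rather than the old generic CPUType.
print_various(torch.rand(4, dtype=torch.double))
```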
diff --git a/test/expect/TestScript.test_string_print-stdout.expect b/test/expect/TestScript.test_string_print-stdout.expect index c8a75f4ca842..19f670510f10 100644 --- a/test/expect/TestScript.test_string_print-stdout.expect +++ b/test/expect/TestScript.test_string_print-stdout.expect @@ -1,2 +1,2 @@ 1 -[ Variable[CPUType]{} ] abcd 2 1.5 +[ Variable[CPULongType]{} ] abcd 2 1.5 diff --git a/test/onnx/expect/TestOperators.test_c2_op.expect b/test/onnx/expect/TestOperators.test_c2_op.expect index 568df7594c6c..bfd22835e355 100644 --- a/test/onnx/expect/TestOperators.test_c2_op.expect +++ b/test/onnx/expect/TestOperators.test_c2_op.expect @@ -55,6 +55,11 @@ graph { f: 1 type: FLOAT } + attribute { + name: "legacy_plus_one" + i: 1 + type: INT + } domain: "org.pytorch._caffe2" } name: "torch-jit-export" diff --git a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect index 61b6d01a99eb..470a5d94d2d9 100644 --- a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect +++ b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect @@ -6,6 +6,11 @@ graph { input: "0" output: "1" op_type: "MaxPool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "dilations" ints: 2 diff --git a/test/onnx/expect/TestOperators.test_sign.expect b/test/onnx/expect/TestOperators.test_sign.expect new file mode 100644 index 000000000000..5fb611054ec6 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_sign.expect @@ -0,0 +1,46 @@ +ir_version: 4 +producer_name: "pytorch" +producer_version: "1.1" +graph { + node { + input: "0" + output: "1" + op_type: "Sign" + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py new file mode 100644 index 000000000000..37a87af255be --- /dev/null +++ b/test/onnx/test_onnx_opset.py @@ -0,0 +1,111 @@ +from test_pytorch_common import TestCase, run_tests + +import torch +import torch.onnx +from torch.nn import Module + +import onnx + +import io + +from torch.onnx.symbolic_helper import _export_onnx_opset_version +from torch.onnx import ir_version, producer_name, producer_version + + +def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_version): + # check_onnx_components + assert model.ir_version == ir_version and \ + model.producer_name == producer_name and \ + model.producer_version == producer_version and \ + model.opset_import[0].version == opset_version + + # check the schema with the onnx checker + onnx.checker.check_model(model) + + # check target type and attributes + graph = model.graph + # ops should contain an object for each node + # in graph.node, in the right order. 
+ # At least the op_name should be specified, + # but the op's attributes can optionally be + # specified as well + assert len(ops) == len(graph.node) + for i in range(0, len(ops)): + assert graph.node[i].op_type == ops[i]['op_name'] + if "attributes" in ops[i] : + attributes = ops[i]['attributes'] + assert len(attributes) == len(graph.node[i].attribute) + for j in range(0, len(attributes)): + for attribute_field in attributes[j].keys(): + assert attributes[j][attribute_field] == getattr(graph.node[i].attribute[j], attribute_field) + + +def check_onnx_opsets_operator(module, x, ops, opset_versions): + for opset_version in opset_versions: + f = io.BytesIO() + torch.onnx.export(module, x, f, opset_version=opset_version) + model = onnx.load(io.BytesIO(f.getvalue())) + check_onnx_opset_operator(model, ops[opset_version], opset_version) + + +class TestONNXOpset(TestCase): + + def test_opset_fallback(self): + class MyModule(Module): + def forward(self, x): + return torch.isnan(x) + + ops = [{"op_name" : "IsNaN"}, + {"op_name" : "Cast", "attributes" : [{"name" : "to", "i" : 2, "type" : 2}]}] + ops = {9 : ops, 10 : ops} + x = torch.tensor([1.0, float('nan'), 2.0]) + check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) + + def test_topk(self): + class MyModule(Module): + def forward(self, x): + return torch.topk(x, 3) + + ops_9 = [{"op_name" : "TopK", "attributes" : [{"name" : "axis", "i" : -1, "type" : 2}, + {"name" : "k", "i" : 3, "type" : 2}]}] + ops_10 = [{"op_name" : "Constant", "attributes" : [{"name" : "value", "type" : 4}]}, + {"op_name" : "Unsqueeze", "attributes" : [{"name" : "axes", "ints" : [0], "type" : 7}]}, + {"op_name" : "TopK", "attributes" : [{"name" : "axis", "i" : -1, "type" : 2}]}] + ops = {9 : ops_9, 10 : ops_10} + x = torch.arange(1., 6., requires_grad=True) + check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) + + def test_maxpool(self): + module = torch.nn.MaxPool1d(2, stride=1) + + ops_9 = [{"op_name" : "MaxPool", + "attributes" : + [{"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}]}] + ops_10 = [{"op_name" : "MaxPool", + "attributes" : + [{"name": "ceil_mode", "i": 0, "type": 2}, + {"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}]}] + ops = {9 : ops_9, 10 : ops_10} + x = torch.randn(20, 16, 50) + check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) + + # add test with dilations + module = torch.nn.MaxPool1d(2, stride=1, dilation=2) + + ops_10 = [{"op_name" : "MaxPool", + "attributes" : + [{"name": "ceil_mode", "i": 0, "type": 2}, + {"name": "dilations", "ints": [2], "type": 7}, + {"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}]}] + ops = {9 : ops_9, 10 : ops_10} + x = torch.randn(20, 16, 50) + check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) + +if __name__ == '__main__': + run_tests() diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 955892e2da6a..4ab0b2fa4b6a 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -416,6 +416,10 @@ def test_slice(self): x = torch.rand(3, 4, requires_grad=True) self.assertONNX(lambda x: x[:, 1:2], x) + def test_sign(self): + x = torch.rand(3, 4, requires_grad=True) + self.assertONNX(lambda x: x.sign(), x) + def test_narrow(self): x = torch.randn(3, 3, 
requires_grad=True) self.assertONNX(lambda x: torch.narrow(x, 0, 0, 2), x) @@ -581,7 +585,7 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( (scores), (bbox_deltas), (im_info), (anchors), - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) return a, b diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 529b186f2352..deecd3630838 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -77,6 +77,11 @@ def wrapper(self): def do_export(model, inputs, *args, **kwargs): f = io.BytesIO() out = torch.onnx._export(model, inputs, f, *args, **kwargs) + if isinstance(model, torch.jit.ScriptModule): + # Special case for common case of passing a single Tensor + if isinstance(inputs, torch.Tensor): + inputs = (inputs,) + out = model(*inputs) return f.getvalue(), out @@ -178,7 +183,7 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, # Verify the model runs the same in Caffe2 verify.verify(model, input, c2, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding) + example_outputs=example_outputs, do_constant_folding=do_constant_folding) def run_model_test(self, model, train, batch_size, state_dict=None, input=None, use_gpu=True, rtol=0.001, atol=1e-7, @@ -1249,6 +1254,17 @@ def forward(self, input): x = torch.tensor([1.0, float('nan'), 2.0]) self.run_model_test(IsNaNModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + def test_scatter(self): + class ScatterModel(torch.nn.Module): + def forward(self, input, indices, values): + return input.scatter(1, indices, values) + + input = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) + values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) + self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), + batch_size=BATCH_SIZE, use_gpu=False) + def test_flatten(self): class FlattenModel(torch.nn.Module): def forward(self, input): @@ -1368,7 +1384,7 @@ def forward(self, feature, im_info, anchors): bbox_deltas = self.conv(feature) a, b = torch.ops._caffe2.GenerateProposals( feature, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) output = torch.ops._caffe2.RoIAlign( feature, a, @@ -1424,7 +1440,7 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( scores, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) return a, b @@ -1458,6 +1474,7 @@ def forward(self, rois, deltas, im_info): angle_bound_lo=-90, angle_bound_hi=90, clip_angle_thresh=0.5, + legacy_plus_one=True, ) return a, b @@ -1475,7 +1492,7 @@ def forward(self, rois, deltas, im_info): im_info[:, 2] = 1.0 im_info = torch.zeros((batch_size, 3)) inputs = (torch.tensor(rois), torch.tensor(deltas), torch.tensor(im_info)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3) + self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) # BoxWithNMSLimits has requirements for the inputs, so randomly generated inputs # in Caffe2BackendTestEmbed doesn't work with this op. 
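For orientation only (not part of the patch): the new `test/onnx/test_onnx_opset.py` added earlier in this patch asserts that exporting the same module at different `opset_version` values yields specific ONNX nodes and attributes (for example, `MaxPool` gains a `ceil_mode` attribute in opset 10). A minimal sketch of that check, using only `torch.onnx.export` and the `onnx` package; the loop and the printed summary are illustrative rather than the test's actual assertions.

```python
import io

import onnx
import torch

module = torch.nn.MaxPool1d(2, stride=1)
x = torch.randn(20, 16, 50)

for opset_version in (9, 10):
    f = io.BytesIO()
    # Export the same module at each opset version under test.
    torch.onnx.export(module, x, f, opset_version=opset_version)
    model = onnx.load(io.BytesIO(f.getvalue()))
    # Print each node's op_type and attribute names so the per-opset
    # differences (e.g. MaxPool's ceil_mode attribute in opset 10) are visible.
    for node in model.graph.node:
        print(opset_version, node.op_type,
              [attr.name for attr in node.attribute])
```

Loading the exported bytes back with `onnx.load` mirrors what `check_onnx_opsets_operator` in the new test file does before comparing the graph node by node against the expected op list.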
@@ -1502,6 +1519,7 @@ def test_c2_box_with_nms_limits(self): -90, 90, clip_angle_thresh, + legacy_plus_one=True, ) ] class_prob = np.random.randn(sum(roi_counts), num_classes).astype(np.float32) @@ -1529,11 +1547,12 @@ def forward(self, class_prob, pred_bbox, batch_splits): cls_agnostic_bbox_reg=False, input_boxes_include_bg_cls=True, output_classes_include_bg_cls=True, + legacy_plus_one=True, ) return a, b, c, d inputs = (torch.tensor(class_prob), torch.tensor(pred_bbox), torch.tensor(batch_splits)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3) + self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) def test_c2_inference_lstm(self): num_layers = 4 @@ -1572,7 +1591,24 @@ def forward(self, lstm_in): torch.from_numpy(hx), ] + [param.detach() for param in torch_lstm._flat_weights] - self.run_model_test(MyModel(), train=False, input=lstm_in, batch_size=3) + self.run_model_test(MyModel(), train=False, input=lstm_in, batch_size=3, use_gpu=False) + + def test_topk(self): + class TopKModel(torch.nn.Module): + def forward(self, input): + return torch.topk(input, 3) + + x = torch.arange(1., 6.) + self.run_model_test(TopKModel(), train=False, input=x, batch_size=BATCH_SIZE) + + def test_topk_script(self): + class TopKModel(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, input): + return torch.topk(input, 3, dim=0) + + x = torch.randn(4, 3, requires_grad=True) + self.run_model_test(TopKModel(), train=False, input=(x,), batch_size=BATCH_SIZE, example_outputs=torch.topk(x, 3, dim=0)) def test_floor(self): class FloorModel(torch.nn.Module): @@ -1643,6 +1679,24 @@ def forward(self, input): return view_by_prim_shape(input) self.run_model_test(PrimShapeModel(), train=False, input=x, batch_size=BATCH_SIZE) + def test_and(self): + class AndModel(torch.nn.Module): + def forward(self, x, y): + return x & y + + x = torch.randint(0, 1, (3, 5)) + y = torch.randint(0, 1, (3, 5)) + self.run_model_test(AndModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + + def test_or(self): + class OrModel(torch.nn.Module): + def forward(self, x, y): + return x | y + + x = torch.randint(0, 1, (3, 5)) + y = torch.randint(0, 1, (3, 5)) + self.run_model_test(OrModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + # a bit of metaprogramming to set up all the rnn tests diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 4d3ee99a1532..0470669a21ec 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -4,7 +4,7 @@ import torch import torch.onnx from torch.onnx import utils -from torch.onnx.symbolic import _set_opset_version +from torch.onnx.symbolic_helper import _set_opset_version import onnx diff --git a/test/onnx/verify.py b/test/onnx/verify.py index b687a99962c1..b104dca726cb 100644 --- a/test/onnx/verify.py +++ b/test/onnx/verify.py @@ -244,7 +244,7 @@ def set_training(model, mode): def verify(model, args, backend, verbose=False, training=False, rtol=1e-3, atol=1e-7, - test_args=2, do_constant_folding=False): + test_args=2, do_constant_folding=False, example_outputs=None): """ Export a model into ONNX, import it into a specified ONNX backend, and then on a few random inputs verify that PyTorch and the backend produced the same @@ -358,14 +358,18 @@ def load_bytes(b): with set_training(model, training): proto_bytes = io.BytesIO() torch_out = torch.onnx._export(model, args, proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding) + 
do_constant_folding=do_constant_folding, example_outputs=example_outputs) + if isinstance(model, torch.jit.ScriptModule): + torch_out = model(*args) proto = load_bytes(proto_bytes) prepared = backend.prepare(proto) def run(args): alt_proto_bytes = io.BytesIO() torch_out = torch.onnx._export(model, args, alt_proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding) + do_constant_folding=do_constant_folding, example_outputs=example_outputs) + if isinstance(model, torch.jit.ScriptModule): + torch_out = model(*args) alt_proto = load_bytes(alt_proto_bytes) if proto.SerializeToString() != alt_proto.SerializeToString(): # OK, let's try to figure out what happened. diff --git a/test/test_autograd.py b/test/test_autograd.py index 971f4a03bdbb..ddf3c4b54813 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2162,6 +2162,24 @@ def f(a, b): run_functional_checks(self, "test_cdist", "cdist", f, True, f_args_variable, f_args_tensor) + def test_var_mean_differentiable(self): + dim = [2, 4] + keepdim = False + input1 = torch.randn(3, 4, 5, 6, 2, 3, requires_grad=True) + input2 = deepcopy(input1) + var1, mean1 = torch.var_mean(input1, dim=dim, keepdim=keepdim) + var2 = input2.var(dim=dim, keepdim=keepdim) + mean2 = input2.mean(dim=dim, keepdim=keepdim) + grad = torch.randn(3, 4, 6, 3, requires_grad=True) + + r1 = var1 * var1 * mean1 * mean1 + r2 = var2 * var2 * mean2 * mean2 + self.assertTrue(torch.allclose(r1, r2, rtol=0.01, atol=0.0)) + + torch.autograd.backward(r1, grad) + torch.autograd.backward(r2, grad) + self.assertTrue(torch.allclose(input1.grad, input2.grad, rtol=0.01, atol=0.0)) + @skipIfNoLapack def test_cholesky(self): def func(root, upper): @@ -3006,6 +3024,34 @@ def parent_on_cpu(inp): # gpu thread ReadyQueue out.sum().backward() + def test_version_counter(self): + x = torch.randn(1, 2) + + # In-place op bumps version + x_saved_version = x._version + x.add_(1).add_(1) + self.assertTrue(x._version > x_saved_version) + + # Differentiable view shares version counter + xz = x[:] + self.assertTrue(x._version == xz._version) + xz.add_(1) + self.assertTrue(x._version == xz._version) + + # `x.data = y` preserves version counter of `x` + x_saved_version = x._version + x.data = torch.randn(2, 3) + self.assertTrue(x._version == x_saved_version) + x.add_(1) + self.assertTrue(x._version > x_saved_version) + # Make sure `x` is still using the same version counter it shares with `xz` + self.assertTrue(x._version == xz._version) + + # In-place op on `xz` also updates version of `x`, + # because they share the version counter + xz.add_(1) + self.assertTrue(x._version == xz._version) + def index_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/test/test_c10d.py b/test/test_c10d.py index e21eb211ab82..0af1979099b6 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -579,6 +579,14 @@ def opts(self, threads=2): opts.threads = threads return opts + def test_empty_tensors(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + xs = [torch.FloatTensor([])] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + def test_broadcast_checks(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) @@ -1344,6 +1352,30 @@ def setUp(self): def tearDown(self): pass + def test_empty_tensors(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = 
c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + xs = [torch.cuda.FloatTensor([])] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.allreduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.reduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + ys = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + for y in ys[0]: + self.assertEqual(0, y.numel()) + + ys = [torch.cuda.FloatTensor([])] + xs = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] + pg.reduce_scatter(ys, xs).wait() + self.assertEqual(0, ys[0].numel()) + def test_broadcast_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1646,52 +1678,60 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, gpus, global_batch_size): + def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): model = Net() ddp_model = DistributedDataParallel( - copy.deepcopy(model).cuda(gpus[0]), - device_ids=gpus, + copy.deepcopy(model).to(devices[0]), + device_ids=device_ids, process_group=process_group, bucket_cap_mb=0.001) - model.cuda(gpus[0]) + model.to(devices[0]) - input = torch.randn(global_batch_size, 2).cuda(gpus[0]) - target = torch.randn(global_batch_size, 4).cuda(gpus[0]) + input = torch.randn(global_batch_size, 2).to(devices[0]) + target = torch.randn(global_batch_size, 4).to(devices[0]) return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, gpus, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): self.assertTrue( - len(gpus) == 2 or len(gpus) == 4, - "unexpected devices for ddp tests {}".format(gpus)) - if len(gpus) == 2: - model = DoubleGpuNet(gpus) - elif len(gpus) == 4: - model = QuadraGpuNet(gpus) + len(devices) == 2 or len(devices) == 4, + "unexpected devices for ddp tests {}".format(devices)) + if len(devices) == 2: + model = DoubleGpuNet(devices) + elif len(devices) == 4: + model = QuadraGpuNet(devices) ddp_model = DistributedDataParallel( copy.deepcopy(model), + device_ids=device_ids, process_group=process_group, bucket_cap_mb=0.001) - input = torch.randn(global_batch_size, 2).to(gpus[0]) + input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, gpus, multi_gpu=False): - local_batch_size = len(gpus) + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + """ + Note: we pass down `device_ids` all the way to DistributedDataParallel + as part of the test. Below you find tests that either use a list of + integers, a list of `torch.Device` instances, or an empty list. + The `devices` argument is used to control placement of the model and + must always be specified as list of `torch.Device` instances. 
+ """ + local_batch_size = len(devices) global_batch_size = self.world_size * local_batch_size - if multi_gpu: + if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, gpus, global_batch_size) + process_group, devices, device_ids, global_batch_size) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, gpus, global_batch_size) + process_group, devices, device_ids, global_batch_size) def step_model(model, input, target): model.train() @@ -1725,87 +1765,72 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, gpus, multi_gpu=False, use_str=False): - if use_str: - gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + def _test_gloo_backend(self, devices, device_ids, multi_device=False): store = c10d.FileStore(self.file.name, self.world_size) options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, gpus, multi_gpu) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + + def test_gloo_backend_cpu_module(self): + self._test_gloo_backend([torch.device('cpu')], []) @skip_if_not_multigpu - def test_gloo_backend(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus) + def test_gloo_backend_1gpu_module_device_ids_integer_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, int_devices) @skip_if_not_multigpu - def test_gloo_backend_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus, use_str=True) + def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, devices) @skip_if_lt_x_gpu(4) def test_gloo_backend_2gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:2], multi_gpu=True) - - @skip_if_lt_x_gpu(4) - def test_gloo_backend_2gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:2], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, [], multi_device=True) @skip_if_lt_x_gpu(8) def test_gloo_backend_4gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:4], multi_gpu=True) - - @skip_if_lt_x_gpu(8) - def test_gloo_backend_4gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:4], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:4] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, gpus, multi_gpu=False, use_str=False): - if use_str: - gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + def _test_nccl_backend(self, devices, device_ids, multi_device=False): store = c10d.FileStore(self.file.name, 
self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, gpus, multi_gpu) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) @skip_if_not_multigpu @skip_if_not_nccl - def test_nccl_backend(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus) + def test_nccl_backend_1gpu_module_device_ids_integer_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, int_devices) @skip_if_not_multigpu @skip_if_not_nccl - def test_nccl_backend_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus, use_str=True) + def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, devices) @skip_if_lt_x_gpu(4) @skip_if_not_nccl def test_nccl_backend_2gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:2], multi_gpu=True) - - @skip_if_lt_x_gpu(4) - @skip_if_not_nccl - def test_nccl_backend_2gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:2], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, [], multi_device=True) @skip_if_lt_x_gpu(8) @skip_if_not_nccl def test_nccl_backend_4gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:4], multi_gpu=True) - - @skip_if_lt_x_gpu(8) - @skip_if_not_nccl - def test_nccl_backend_4gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:4], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:4] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, [], multi_device=True) @skip_if_lt_x_gpu(4) @skip_if_not_nccl @@ -2373,16 +2398,15 @@ def check_no_grads(): @skip_if_not_multigpu @skip_if_not_nccl def test_accumulate_gradients(self): - gpus = gpus_for_rank(self.world_size)[self.rank][0:1] - self.assertEqual(len(gpus), 1) + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) store = c10d.FileStore(self.file.name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - local_batch_size = len(gpus) - global_batch_size = self.world_size * local_batch_size + global_batch_size = self.world_size model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, gpus, global_batch_size) + process_group, devices, devices, global_batch_size) def step_model(model, input, target): model.train() @@ -2395,25 +2419,25 @@ def step_model(model, input, target): ddp_model.train() ddp_model.module(input) - # check two model parameters over 2 iterations + # Check two model parameters over 4 iterations. + # Use 4 iterations because we alternate between reducing and + # not reducing and want to make sure we switch both ways. 
for iteration in range(4): - # single cpu/gpu training step_model(model, input, target) if iteration % 2 == 0: # Skip gradients sync without calling prepare_for_backward - step_model(ddp_model.module, - input[self.rank * local_batch_size: (self.rank + 1) * local_batch_size], - target[self.rank * local_batch_size: (self.rank + 1) * local_batch_size]) - + step_model( + ddp_model.module, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)]) for i, j in zip(model.parameters(), ddp_model.parameters()): self.assertNotEqual(i.grad, j.grad) else: - # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs - step_model(ddp_model, - input[self.rank * local_batch_size: (self.rank + 1) * local_batch_size], - target[self.rank * local_batch_size: (self.rank + 1) * local_batch_size]) - + step_model( + ddp_model, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)]) for i, j in zip(model.parameters(), ddp_model.parameters()): self.assertEqual(i.grad, j.grad) @@ -2646,6 +2670,72 @@ def test_multi_limit_multi_dtype(self): self.assertEqual([[0], [1], [2, 4], [3, 5]], result) +class CommTest(MultiProcessTestCase): + def tearDown(self): + super(CommTest, self).tearDown() + try: + os.remove(self.file.name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + def _test_broadcast_coalesced(self, process_group, device): + half = torch.float16 + + # No support for float16 for CPU tensors + if device == torch.device('cpu'): + half = torch.float32 + + target = torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float64, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + + # The tensors to pass to broadcast are idential to the target + # only on the process that is the root of the broadcast. 
+ if self.rank == 0: + tensors = list(tensor.clone() for tensor in target) + else: + tensors = list(torch.empty_like(tensor) for tensor in target) + + c10d._broadcast_coalesced( + process_group, + tensors, + buffer_size=256) + + self.assertEqual(tensors, target) + + @skip_if_not_multigpu + @skip_if_not_nccl + def test_broadcast_coalesced_nccl(self): + store = c10d.FileStore(self.file.name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + device = torch.device('cuda:%d' % self.rank) + self._test_broadcast_coalesced(process_group, device) + + @skip_if_not_multigpu + def test_broadcast_coalesced_gloo_cuda(self): + store = c10d.FileStore(self.file.name, self.world_size) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + device = torch.device('cuda:%d' % self.rank) + self._test_broadcast_coalesced(process_group, device) + + def test_broadcast_coalesced_gloo_cpu(self): + store = c10d.FileStore(self.file.name, self.world_size) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + device = torch.device('cpu') + self._test_broadcast_coalesced(process_group, device) + + if __name__ == '__main__': assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process" diff --git a/test/test_cuda.py b/test/test_cuda.py index 5afcfd1b74bf..2a400e47bd72 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -974,15 +974,47 @@ def test_copy_streams(self): self._test_copy_sync_current_stream(x0, x2) def test_copy_non_blocking(self): - x = torch.randn(5, 5).cuda() - y = torch.zeros(5, 5) - y.copy_(x, non_blocking=True) - self.assertEqual(x, y) + def _test_copy_non_blocking(a, b): + event = torch.cuda.Event() + a.copy_(b, non_blocking=True) + event.record() + self.assertFalse(event.query()) + event.synchronize() + self.assertEqual(a, b) - x = torch.randn(5, 5) - y = torch.zeros(5, 5).cuda() - y.copy_(x, non_blocking=True) - self.assertEqual(x, y) + # 10MB copies + x = torch.ones(10000000, dtype=torch.uint8).cuda() + y = torch.zeros(10000000, dtype=torch.uint8).pin_memory() + _test_copy_non_blocking(x, y) + + x = torch.zeros(10000000, dtype=torch.uint8).pin_memory() + y = torch.ones(10000000, dtype=torch.uint8).cuda() + _test_copy_non_blocking(x, y) + + def test_copy_broadcast(self): + x = torch.randn(10, 5) + y = torch.randn(5, device='cuda') + x.copy_(y) + self.assertEqual(x[3], y.cpu()) + + x = torch.randn(10, 5, device='cuda') + y = torch.randn(5) + x.copy_(y) + self.assertEqual(x[3].cpu(), y) + + def test_copy_noncontig(self): + def do_test(d0, d1): + x = torch.tensor([1.5, 2.5, 3.5, 4.5, 5.5, 6.5], device=d0) + y = torch.tensor([0, 0, 0, 0, 0, 0], device=d1) + self.assertNotEqual(x.dtype, y.dtype) + + y[::2].copy_(x[::2]) + self.assertEqual(y, [1, 0, 3, 0, 5, 0]) + + do_test('cpu', 'cuda') + do_test('cuda', 'cpu') + if TEST_MULTIGPU: + do_test('cuda:0', 'cuda:1') def test_serialization_array_with_storage(self): x = torch.randn(5, 5).cuda() @@ -2720,9 +2752,6 @@ def test_bincount_cuda(self): self.assertEqual(t.cpu().bincount(), t.bincount()) self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) - def test_histc_cuda(self): - _TestTorchMixin._test_histc(self, device='cuda') - def test_tiny_half_norm_(self): a = 
torch.arange(25).cuda().float() a /= 100000000 diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 825689ffced6..2cf3c4b5c7a1 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -454,6 +454,10 @@ def kill_pid(pid): def init_fn(worker_id): torch.manual_seed(12345) +# used with test_error_in_init +def error_worker_init_fn(_): + raise RuntimeError("Error in worker_init_fn") + class TestDataLoader(TestCase): @@ -509,6 +513,11 @@ def fn(): self.assertRaises(ValueError, fn) + def test_error_in_init(self): + loader = DataLoader(self.dataset, num_workers=2, worker_init_fn=error_worker_init_fn) + with self.assertRaisesRegex(RuntimeError, 'Error in worker_init_fn'): + list(iter(loader)) + def test_sequential(self): self._test_sequential(DataLoader(self.dataset)) diff --git a/test/test_distributions.py b/test/test_distributions.py index c35dec77ef12..3f74918d0d8a 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -953,6 +953,18 @@ def ref_log_prob(idx, x, log_prob): logits = probs_to_logits(probs, is_binary=True) self._check_log_prob(Binomial(total_count, logits=logits), ref_log_prob) + def test_binomial_stable(self): + logits = torch.tensor([-100., 100.], dtype=torch.float) + total_count = 1. + x = torch.tensor([0., 0.], dtype=torch.float) + log_prob = Binomial(total_count, logits=logits).log_prob(x) + self.assertTrue(torch.isfinite(log_prob).all()) + + # make sure that the grad at logits=0, value=0 is 0.5 + x = torch.tensor(0., requires_grad=True) + y = Binomial(total_count, logits=x).log_prob(torch.tensor(0.)) + self.assertEqual(grad(y, x)[0], torch.tensor(-0.5)) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_binomial_log_prob_vectorized_count(self): probs = torch.tensor([0.2, 0.7, 0.9]) diff --git a/test/test_fake_quant.py b/test/test_fake_quant.py index b8d8dbb5dba1..7c39ee2b9b8c 100644 --- a/test/test_fake_quant.py +++ b/test/test_fake_quant.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch +import torch.cuda import torch.jit import numpy as np import unittest @@ -66,6 +67,9 @@ def test_backward(self): np.testing.assert_allclose(dX, dX_prime, rtol=tolerance, atol=tolerance) def test_numerical_consistency(self): + ''' + Comparing numerical consistency between CPU quantize/dequantize op and the CPU fake quantize op + ''' np.random.seed(NP_RANDOM_SEED) fake_quantize_per_tensor_affine_forward = torch.ops.quantized.fake_quantize_per_tensor_affine_forward @@ -74,13 +78,72 @@ def test_numerical_consistency(self): num_bits = 8 X = np.random.rand(20, 20) * 125 X_torch = torch.from_numpy(X).float() - Y = X_torch.quantize_linear(scale, zero_point).dequantize() + Y = torch.dequantize(torch.quantize_linear(X_torch, scale, zero_point, torch.qint8)) Y_prime = fake_quantize_per_tensor_affine_forward( X=X_torch, scale=scale, zero_point=zero_point, num_bits=num_bits, quant_delay=0, iter=0) tolerance = 1e-6 np.testing.assert_allclose(Y, Y_prime, rtol=tolerance, atol=tolerance) + """Tests the forward path of the FakeQuantizePerTensorAffine CUDA op.""" + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_forward_cuda(self): + np.random.seed(NP_RANDOM_SEED) + fake_quantize_per_tensor_affine_forward = torch.ops.quantized.fake_quantize_per_tensor_affine_forward + + scale = 3 + zero_point = 2 + num_bits = 8 + X = np.random.rand(20, 20) * 125 + X_torch = torch.from_numpy(X).float().cuda() + Y = _fake_quantize_per_tensor_affine_reference(X, scale, zero_point, 
num_bits) + Y_prime = fake_quantize_per_tensor_affine_forward( + X=X_torch, scale=scale, zero_point=zero_point, num_bits=num_bits, + quant_delay=0, iter=0) + tolerance = 1e-6 + np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) + + """Tests the backward method. Note that this runs the reference quantization + and thus the errors might be originating there.""" + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_backward_cuda(self): + np.random.seed(NP_RANDOM_SEED) + fake_quantize_per_tensor_affine_backward = torch.ops.quantized.fake_quantize_per_tensor_affine_backward + + scale = 3 + zero_point = 2 + num_bits = 8 + X = np.random.rand(20, 20) * 125 + Y = _fake_quantize_per_tensor_affine_reference(X, scale, zero_point, num_bits) + dY = Y - X # Fake gradient + dX = _fake_quantize_per_tensor_affine_grad_reference(X, dY, scale, zero_point, num_bits) + X_torch = torch.from_numpy(X).float().cuda() + dY_torch = torch.from_numpy(dY).float().cuda() + dX_prime = fake_quantize_per_tensor_affine_backward( + X=X_torch, dY=dY_torch, scale=scale, zero_point=zero_point, + num_bits=num_bits, quant_delay=0, iter=0) + tolerance = 1e-6 + np.testing.assert_allclose(dX, dX_prime.cpu(), rtol=tolerance, atol=tolerance) + + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_numerical_consistency_cuda(self): + ''' + Comparing numerical consistency between CPU quantize/dequantize op and the CUDA fake quantize op + ''' + np.random.seed(NP_RANDOM_SEED) + fake_quantize_per_tensor_affine_forward = torch.ops.quantized.fake_quantize_per_tensor_affine_forward + + scale = 3 + zero_point = 2 + num_bits = 8 + X = np.random.rand(20, 20) * 125 + X_torch = torch.from_numpy(X).float() + Y = torch.dequantize(torch.quantize_linear(X_torch, scale, zero_point, torch.qint8)) + Y_prime = fake_quantize_per_tensor_affine_forward( + X=X_torch.cuda(), scale=scale, zero_point=zero_point, num_bits=num_bits, + quant_delay=0, iter=0) + tolerance = 1e-6 + np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index 7ce71658f287..bbf826bd1668 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -17,7 +17,7 @@ from torch._six import inf, PY2, builtins, StringIO from common_utils import TestCase, run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ skipIfRocm, skipIfNoLapack, suppress_warnings, load_tests, IS_SANDCASTLE, \ - freeze_rng_state, set_rng_seed, slowTest + freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName from common_nn import module_tests, new_module_tests, criterion_tests from textwrap import dedent from functools import wraps, reduce @@ -75,9 +75,7 @@ CUDA_VERSION = torch._C._cuda_getCompiledVersion() for d in range(torch.cuda.device_count()): major = torch.cuda.get_device_capability(d)[0] - if (CUDA_VERSION < 8000 and major >= 6) or (CUDA_VERSION < 9000 and major >= 7): - RUN_CUDA = False - if (CUDA_VERSION < 9000 or major < 6): + if (major < 6): RUN_CUDA_HALF = False RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1 @@ -86,25 +84,6 @@ WINDOWS = sys.platform == 'win32' -if WINDOWS: - @contextmanager - def TemporaryFileName(): - # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile - # opens the file, and it cannot be opened multiple times in Windows. 
To support Windows, - # close the file after creation and try to remove it manually - f = tempfile.NamedTemporaryFile(delete=False) - try: - f.close() - yield f.name - finally: - os.unlink(f.name) -else: - @contextmanager # noqa: T484 - def TemporaryFileName(): - with tempfile.NamedTemporaryFile() as f: - yield f.name - - def LSTMCellF(input, hx, cx, *params): return LSTMCell(input, (hx, cx), *params) @@ -267,6 +246,11 @@ def wrapper(*args, **kwargs): return wrapper return noop_fuser +@contextmanager +def enable_profiling_mode(): + torch._C._jit_set_profiling_mode(True) + yield + torch._C._jit_set_profiling_mode(False) # note: not re-entrant, use unnested only @contextmanager @@ -1316,15 +1300,15 @@ def forward(self, x): x = F.relu(self.conv1(x)) return x - trace = testModule() + scriptM = testModule() # Constant Propagation step is performed because this pass is intended # to insert quant-dequant nodes for quantizable tensors. The type analysis # happens as part of this jit pass - torch._C._jit_pass_constant_propagation(trace.graph) + torch._C._jit_pass_constant_propagation(scriptM.graph) # TODO: Build the qparam_dict from parse_ir directly for this pass - qparam_dict = _helper_generate_qparam(trace, input_data) - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + qparam_dict = _helper_generate_qparam(scriptM, input_data) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # both conv and relu nodes and at external output since relu @@ -1332,14 +1316,11 @@ def forward(self, x): # quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ .check_next("dequantize_linear") \ - .check("conv2d").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("conv2d").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .run(str(trace.graph)) - FileCheck().check("relu").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("relu").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .check_next("return").run(str(trace.graph)) + .check_next("return").run(str(scriptM.graph)) def test_insert_quantdequant_consecutive_qnodes_trace(self): input_data = torch.ones([1, 1, 5, 5]) @@ -1353,12 +1334,12 @@ def forward(self, x): x = F.relu(self.conv1(x)) return x - trace = torch.jit.trace(testModule(), (input_data)) + scriptM = torch.jit.trace(testModule(), (input_data)) - qparam_dict = _helper_generate_qparam(trace, input_data) + qparam_dict = _helper_generate_qparam(scriptM, input_data) if not len(qparam_dict): return - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # both conv and relu nodes and at external output since relu @@ -1366,14 +1347,11 @@ def forward(self, x): # quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ .check_next("dequantize_linear") \ - .check("_convolution").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("_convolution").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .run(str(trace.graph)) - FileCheck().check("relu").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("relu").check("quantize_linear") \ 
.check_next("int_repr").check_next("dequantize_linear") \ - .check_next("return").run(str(trace.graph)) + .check_next("return").run(str(scriptM.graph)) def test_insert_quantdequant_single_qnode(self): input_data = torch.ones([1, 1, 5, 5]) @@ -1389,26 +1367,24 @@ def forward(self, x): x1 = torch.add(x, 1) return x1 - trace = testModule() + scriptM = testModule() # Constant Propagation step is performed because this pass is intended # to insert quant-dequant nodes for quantizable tensors. The type analysis # happens as part of this jit pass - torch._C._jit_pass_constant_propagation(trace.graph) + torch._C._jit_pass_constant_propagation(scriptM.graph) - qparam_dict = _helper_generate_qparam(trace, input_data) - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + qparam_dict = _helper_generate_qparam(scriptM, input_data) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # both conv and no quant-dequant after add. Constant nodes correspond # to params for the quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ .check_next("dequantize_linear") \ - .check("conv2d").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("conv2d").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .check_next("add").check_next("return") \ - .run(str(trace.graph)) + .check_next("add").check_next("return").run(str(scriptM.graph)) def test_insert_quantdequant_alternate_qnode(self): input_data = torch.ones([1, 1, 5, 5]) @@ -1425,28 +1401,105 @@ def forward(self, x): x2 = F.relu(x1) return x2 - trace = testModule() + scriptM = testModule() # Constant Propagation step is performed because this pass is intended # to insert quant-dequant nodes for quantizable tensors. The type analysis # happens as part of this jit pass - torch._C._jit_pass_constant_propagation(trace.graph) + torch._C._jit_pass_constant_propagation(scriptM.graph) - qparam_dict = _helper_generate_qparam(trace, input_data) - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + qparam_dict = _helper_generate_qparam(scriptM, input_data) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # conv, relu and add. 
Constant nodes correspond to params for the # quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ - .check_next("dequantize_linear") \ - .check("conv2d").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ - .check_next("int_repr").run(str(trace.graph)) - FileCheck().check("add").check_next("Constant")\ - .check_next("Constant").check_next("quantize_linear") \ + .check_next("dequantize_linear").check("conv2d") \ + .check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear").run(str(scriptM.graph)) + FileCheck().check("add").check("quantize_linear") \ .check_next("int_repr").check("dequantize_linear") \ - .run(str(trace.graph)) + .run(str(scriptM.graph)) + + def test_insert_quantdequant_for_weight(self): + input_data = torch.ones([1, 1, 1, 1]) + + class testModule(torch.jit.ScriptModule): + def __init__(self): + super(testModule, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1, 1) + + @torch.jit.script_method + def forward(self, x): + x = self.conv1(x) + return x + + def getQParamFunc(value): + scale = 0.5 + zero_point = 1 + return 'per_tensor_quant', scale, zero_point + + scriptModule = testModule() + + # Constant Propagation step is performed because this pass is intended + # to insert quant-dequant nodes for quantizable tensors. The type analysis + # happens as part of this jit pass + torch._C._jit_pass_constant_propagation(scriptModule.graph) + torch._C._jit_pass_insert_quantdequant_for_weight_bias(scriptModule._c, + "forward", + "weight", + getQParamFunc) + + # We expect to see quant-dequant node before conv node for weight. + FileCheck().check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear") \ + .check("conv2d").run(str(scriptModule.graph)) + + def test_insert_quantdequant_for_bias(self): + # Inserting quant-dequant nodes for bias requires scale info present for + # activation and weight so q-dq pass done first for these inputs. + + class testModule(torch.jit.ScriptModule): + def __init__(self): + super(testModule, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1, 1).float() + + @torch.jit.script_method + def forward(self, x): + x = x.quantize_linear(1.0, 0, torch.uint8) + x = x.int_repr() + x = x.dequantize_linear(1.0, 0, torch.uint8) + x = self.conv1(x) + return x + + def getQParamFuncW(value): + return 'per_tensor_quant', 0.5, 1 + + def getQParamFunc(input_scale, weight_scale): + scale = 1 / input_scale / weight_scale + zero_point = 0 + return 'per_tensor_quant', scale, zero_point + + scriptModule = testModule() + + torch._C._jit_pass_constant_propagation(scriptModule.graph) + torch._C._jit_pass_insert_quantdequant_for_weight_bias(scriptModule._c, + "forward", + "weight", + getQParamFuncW) + torch._C._jit_pass_insert_quantdequant_for_weight_bias(scriptModule._c, + "forward", + "bias", + getQParamFunc) + # We expect to see 3 pairs of quant-dequant nodes. 
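# (One pair comes from the explicit quantize_linear/int_repr/dequantize_linear on the
#  activation in forward() above, and the other two are inserted by the weight and bias
#  passes, which is why the FileCheck below looks for three such triplets before conv2d.)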
+ + FileCheck().check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear").check("quantize_linear") \ + .check_next("int_repr").check_next("dequantize_linear") \ + .check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear").check("conv2d") \ + .run(str(scriptModule.graph)) def test_pattern_based_rewrite(self): # mul(mul(mul(mul(x,y),z),x),y) --> mul(mul(mulmul(x,y,z), x), y) --> @@ -2138,7 +2191,7 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( (scores), (bbox_deltas), (im_info), (anchors), - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) return a, b model = MyModel() @@ -3109,6 +3162,21 @@ def fn(x): warns = [str(w.message) for w in warns] self.assertEqual(len(warns), 0) + @unittest.skipIf(sys.platform == "win32", "temp file name on windows") + def test_trace_save(self): + def fn(x): + return x + 2 + + def check(func): + with tempfile.NamedTemporaryFile() as f: + func.save(f.name) + loaded = torch.jit.load(f.name) + input = torch.randn(2, 2) + self.assertEqual(func(input), loaded(input)) + + out = torch.jit.trace(fn, (torch.ones(2, 2),)) + check(out) + @unittest.skipIf(sys.platform == "win32", "TODO: need to fix this test case for Windows") def test_torch_load_error(self): class J(torch.jit.ScriptModule): @@ -3266,6 +3334,45 @@ def foo(x): else: cu.define(full) + def test_inherit_method(self): + class A(torch.jit.ScriptModule): + def __init__(self): + super(A, self).__init__() + + @torch.jit.script_method + def forward(self, x): + return x + self.bar(x) + + class B(A): + def __init__(self): + super(B, self).__init__() + + @torch.jit.script_method + def bar(self, x): + return x * x + + with self.assertRaisesRegex(RuntimeError, 'attribute'): + A() # cannot use because bar is not defined + + v = torch.rand(3, 4) + b = B() + self.assertEqual(b(v), v + v * v) + + class C(torch.jit.ScriptModule): + def __init__(self): + super(C, self).__init__() + + @torch.jit.script_method + def bar(self, x): + return x + + class D(C, B): + def __init__(self): + super(D, self).__init__() + + self.assertEqual(D()(v), v + v) + + def test_tracing_multiple_methods(self): class Net(nn.Module): def __init__(self): @@ -3291,6 +3398,11 @@ def weighted_kernel_sum(self, weight): check_inputs.append({'forward' : check_forward_input, 'weighted_kernel_sum' : check_weight}) module = torch.jit.trace_module(n, inputs, True, True, check_inputs) + module = torch.jit.trace(n.forward, example_forward_input) + module = torch.jit.trace(n.forward, example_forward_input, True, True, [example_forward_input]) + with self.assertRaisesRegex(AttributeError, "trace doesn't support compiling individual module's functions"): + module = torch.jit.trace(n.weighted_kernel_sum, inputs) + def test_submodule_twice(self): @torch.jit.script def foo(x): @@ -3742,12 +3854,17 @@ def func(a, b): def func2(a, b, c, d): return c + a ** b ** d + def func3(a, b): + # type: (int, float) -> float + return a ** b + a = torch.rand(1, requires_grad=True) b = torch.rand(1, requires_grad=True) c = torch.rand(1, requires_grad=True) d = torch.rand(1, requires_grad=True) self.checkScript(func, (a, b), optimize=True) self.checkScript(func2, (a, b, c, d), optimize=True) + self.checkScript(func3, (4, -0.5), optimize=True) @unittest.skipIf(not RUN_CUDA, "device tests require CUDA") def test_pow_scalar_backward_cuda(self): @@ -4338,7 +4455,7 @@ def reassign_from_empty_literal(): if True: x = 
[1, 2, 3] return - with self.assertRaisesRegex(RuntimeError, r"previously has type Tensor\[\]"): + with self.assertRaisesRegex(RuntimeError, r"previously has type List\[Tensor\]"): self.checkScript(reassign_from_empty_literal, (), optimize=False) def reassign_from_empty_builtin(): @@ -5006,6 +5123,31 @@ def func(alpha, beta, x, y): # NOTE: cannot optimize yet because broadcasts are not inserted before the fuser runs self.checkScript(script, [alpha, beta, x, y], optimize=False, outputs=outputs) + def test_profiling_graph_executor(self): + @torch.jit.script + def basic(x, y): + a = x + y + b = x * y + c = x + 1 + d = a - c + e = b - c + return d + e + + a = torch.rand(2, 3) + b = torch.rand(2, 3) + + with enable_profiling_mode(): + basic(a, b) + basic(a, b) + basic(a, b) + + # this tests that a profiling count is being decrement by + # a profile instruction. + # this is the easiest way to test that a graph was instrumented + # from python + with self.assertRaisesRegex(RuntimeError, "Not yet implemented"): + basic(a, b) + def test_resize_input_ops(self): # resize_ and resize_as resize the input tensor. because our shape analysis # is flow invariant, we set any Tensor that can alias a resized Tensor @@ -5510,7 +5652,7 @@ def test_not_cast(x): self.checkScript(test_not_cast, (torch.tensor(1),)) self.checkScript(test_not_cast, (torch.tensor(0),)) - with self.assertRaisesRegex(RuntimeError, "expected"): + with self.assertRaisesRegex(RuntimeError, "Could not cast value of type Tuple\[Tensor, Tensor\]"): # noqa: W605 @torch.jit.script def test_mult(x, y): return not(x, y) @@ -5535,7 +5677,7 @@ def test_cast_float(x): self.checkScript(test_cast_float, (0.,)) self.checkScript(test_cast_float, (-1.,)) - with self.assertRaisesRegex(RuntimeError, "expected a bool, int, float, or Tensor"): + with self.assertRaisesRegex(RuntimeError, "Could not cast value of type Tuple\[int, int\] to bool"): # noqa: W605 @torch.jit.script def test_bad_conditional(x): if (1, 2): @@ -5860,6 +6002,61 @@ def test_pow_int(x, y): self.checkScript(test_pow_float, (2.0, 2.0)) self.checkScript(test_pow_int, (2.0, 2)) + @unittest.skipIf(PY2, "Requires python 3") + def test_math_gcd(self): + def test_gcd(x, y): + # type: (int, int) -> int + return math.gcd(x, y) + + for inputs in [(2, 4), (-5, -15), (-5, 15), (10, 0), (0, 10), (-5, 0), (0, -5), (0, 0), (0, -0)]: + self.checkScript(test_gcd, inputs) + + def test_math_ops1(self): + funcs_template = dedent(''' + def func(): + return math.{func}({scalar}) + ''') + + def run_test(code): + scope = {} + execWrapper(code, globals(), scope) + cu = torch.jit.CompilationUnit(code) + self.assertEqual(cu.func(), scope['func']()) + + special_domain = ['gamma', 'lgamma'] + + for func in ['erf', 'erfc', 'expm1', 'fabs', 'gamma', 'lgamma']: + for scalar in [1, 10, 0, -1, -1.5, 5.0, 1.5]: + if func in special_domain and scalar in [0, -1]: + continue + code = funcs_template.format(func=func, scalar=scalar) + run_test(code) + + def test_math_copysign(self): + + def func1(x, y): + # type: (int, int) -> float + return math.copysign(x, y) + + def func2(x, y): + # type: (int, float) -> float + return math.copysign(x, y) + + def func3(x, y): + # type: (float, int) -> float + return math.copysign(x, y) + + def func4(x, y): + # type: (float, float) -> float + return math.copysign(x, y) + + inputs = [(3.3, 5.5), (3.3, -5.5), (-3.3, 5.5), (-3.3, -5.5), (3.3, 0.0), (0.0, 3.3)] + for a, b in inputs: + self.checkScript(func1, (int(a), int(b))) + self.checkScript(func2, (int(a), b)) + self.checkScript(func3, (a, 
int(b))) + self.checkScript(func4, (a, b)) + def test_if_nest_while(self): def func(a, b): # type: (int, int) -> int @@ -8089,7 +8286,7 @@ def foo(i): v = torch.rand(10, 3) self.checkScript(foo, (v,)) - with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type \(Tensor, Tensor\)"): + with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type Tuple"): @torch.jit.script def mixtypes(x): a = (x, x) @@ -8347,7 +8544,7 @@ def fn(x : torch.Tensor, y : Tensor, z) -> Tuple[Tensor, Tensor, Tensor]: fn = get_fn('test_type_annotation_py3', script_path) with self.assertRaisesRegex(RuntimeError, r"expected a value of type Tensor for argument" - r" '0' but found \(Tensor, Tensor\)"): + r" '0' but found Tuple\[Tensor,"): @torch.jit.script def bad_fn(x): x, y = fn((x, x), x, x) @@ -9285,7 +9482,7 @@ def f3(a): def f4(a): torch.cat(a) - with self.assertRaisesRegex(RuntimeError, r'argument \'tensors\' but found int\[\]'): + with self.assertRaisesRegex(RuntimeError, r'argument \'tensors\' but found List\[int\]'): @torch.jit.script def f5(a): torch.cat([3]) @@ -9342,7 +9539,7 @@ def foo(x, y): self.assertExpected(str(cu.foo.schema)) def test_parser_type_annotations_unknown_type(self): - with self.assertRaisesRegex(RuntimeError, r'Unknown type name Foo'): + with self.assertRaisesRegex(RuntimeError, "Unknown type name 'Foo'"): cu = torch.jit.CompilationUnit(''' def foo(x : Tensor, y : Tuple[Tuple[Foo, Tensor], Tensor]) -> Tuple[Tensor, Tensor]: return x, x @@ -10759,6 +10956,61 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]: test_str.append(str(fn.schema)) self.assertExpected("\n".join(test_str)) + @unittest.skipIf(not PY35, "Python 3.5 needed") + def test_multiline_annot_ast_py3_fn(self): + code = dedent(''' + from typing import Tuple, List, Optional + from torch import Tensor + from torch.jit.annotations import BroadcastingList2, BroadcastingList3 + import torch + @torch.jit.script + def foo(x, # type: {input} + y # type: Tuple[Tensor, Tensor] + ): + # type: (...) -> Tuple[{output}, {output}] + return x, x + ''') + test_str = [] + + for pair in self.type_input_return_pairs(): + fn = self._get_py3_code(self.format_code(code, pair), 'foo') + args = fn.schema.arguments + returns = fn.schema.returns + self.assertEqual(str(args[0].type), pair[1]) + self.assertEqual(str(args[1].type), "Tuple[Tensor, Tensor]") + self.assertEqual(str(returns[0].type), "Tuple[{}, {}]".format(pair[1], pair[1])) + + def test_bad_multiline_annotations(self): + with self.assertRaisesRegex(RuntimeError, "Return type line"): + @torch.jit.script + def bad_type_line(a, # type: Tensor + b, # type: Tensor + c # type: Tensor + ): + # type: (int, int, int) -> Tensor + # type: bad type line # noqa: F723 + + return a + b + c + + with self.assertRaisesRegex(RuntimeError, "Return type line"): + @torch.jit.script + def bad_return_line(a, # type: Tensor + b, + c # type: Tensor + ): + # type: (int, int, int) -> Tensor + return a + b + c + + # TODO: this should be supported but is difficult to parse + with self.assertRaisesRegex(RuntimeError, "Number of type annotations"): + @torch.jit.script + def missing_type(a, # type: Tensor + b, + c # type: Tensor + ): + # type: (...) 
-> Tensor + return a + b + c + # Python AST Frontend , Python 3-style type annotations , Script method @unittest.skipIf(not PY35, "Python 3.5 needed") def test_annot_ast_py3_method(self): @@ -11471,6 +11723,28 @@ def forward(self, x): weak_mod.weight = torch.nn.Parameter(torch.ones(5, 5) * 100) self.assertFalse(strong_mod(inp).allclose(weak_mod(inp))) + def test_weak_module_isinstance(self): + tester = self + + class M(torch.jit.ScriptModule): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(2, 2) + tester.assertTrue(isinstance(self.linear, nn.Linear)) + + m = M() + + def test_weak_module_attributes(self): + tester = self + + class M(torch.jit.ScriptModule): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(2, 2) + tester.assertEqual(self.linear.in_features, 2) + + m = M() + def test_backend_cudnn_enabled(self): # Only test that this compiles @torch.jit.script @@ -11796,6 +12070,30 @@ def fn(x): self.checkScript(fn, ("abcde",)) + def test_str_ops(self): + def test_str_is(s): + # type: (str) -> Tuple[bool, bool, bool, bool, bool, bool] + return s.isupper(), s.islower(), s.isdigit(), s.isspace(), \ + s.isalnum(), s.isalpha() + + def test_str_to(s): + # type: (str) -> Tuple[str, str] + return s.upper(), s.lower() + + inputs = ["", "12a", "!B", "12", "a", "B", "aB", "$12", "B12", "AB ", + " \t", " \n", "\na", "abc"] + + for input in inputs: + self.checkScript(test_str_is, (input,)) + self.checkScript(test_str_to, (input,)) + + def test_str_cmp(a, b): + # type: (str, str) -> Tuple[bool, bool, bool, bool, bool, bool] + return a != b, a == b, a < b, a > b, a <= b, a >= b + + for i in range(len(inputs) - 1): + self.checkScript(test_str_cmp, (inputs[i], inputs[i + 1])) + def test_ord(self): def fn(x): # type: (str) -> int @@ -11811,6 +12109,48 @@ def index_str_to_tensor(s): s = u'\u00a3'.encode('utf8')[:1] self.checkScript(index_str_to_tensor, (s,)) + @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") + def test_get_set_state(self): + class M(torch.jit.ScriptModule): + __constants__ = ['number'] + + def __init__(self, number, submodule=None): + super(M, self).__init__() + self.register_buffer('buffer1', torch.ones(2, 2)) + self.register_buffer('buffer2', torch.ones(2, 2)) + self.number = number + if submodule: + self.submodule = submodule + + @torch.jit.script_method + def __getstate__(self): + # type: () -> Tuple[Tensor, Tensor, int] + return (self.buffer1, self.buffer2, 74) + + @torch.jit.script_method + def __setstate__(self, state): + # type: (Tuple[Tensor, Tensor, int]) -> None + self.buffer1 = state[0] + 10 + self.buffer2 = state[1] + 10 + + with TemporaryFileName() as fname: + m = M(23, submodule=M(99)) + m.save(fname) + loaded = torch.jit.load(fname) + + # Check original module + self.assertEqual(m.buffer1, torch.ones(2, 2)) + self.assertEqual(m.buffer2, torch.ones(2, 2)) + + # Check top level module + self.assertEqual(loaded.buffer1, torch.ones(2, 2) + 10) + self.assertEqual(loaded.buffer2, torch.ones(2, 2) + 10) + + # Check submodule + self.assertEqual(loaded.submodule.buffer1, torch.ones(2, 2) + 10) + self.assertEqual(loaded.submodule.buffer2, torch.ones(2, 2) + 10) + + def test_string_slicing(self): def fn1(x): # type: (str) -> str @@ -12244,22 +12584,35 @@ def foo(x): m = self.createFunctionFromGraph(foo.graph) self.getExportImportCopy(m) + def get_pickle_values(self): + return (('dict', {"I": "am", "a test": "test"}, Dict[str, str]), + ('float', 2.3, float), + ('int', 99, int), 
+ ('bool', False, bool), + ('tuple', (1, 2, 3, 4), Tuple[int, int, int, int]), + ('list', [(1, 2), (3, 4)], List[Tuple[int, int]]), + ('tensor', torch.randn(2, 2), torch.Tensor), + ('int_list', [1, 2, 3, 4], List[int]), + ('tensor_list', [torch.ones(2, 2) + i for i in range(4)], List[torch.Tensor]), + ('bool_list', [True, True, False, True], List[bool]), + ('float_list', [1., 2., 3., 4.], List[float]), + ('str_list', ['hello', 'bye'], List[str]), + ('none', None, Optional[int]),) + def test_attribute_serialization(self): + tester = self + class M(torch.jit.ScriptModule): def __init__(self): super(M, self).__init__() - self.table = torch.jit.Attribute({"I": "am", "a test": "test"}, Dict[str, str]) - self.float = torch.jit.Attribute(2.3, float) - self.int = torch.jit.Attribute(99, int) - self.bool = torch.jit.Attribute(False, bool) - self.tuple = torch.jit.Attribute((1, 2, 3, 4), Tuple[int, int, int, int]) - self.list = torch.jit.Attribute([(1, 2), (3, 4)], List[Tuple[int, int]]) - self.tensor = torch.jit.Attribute(torch.randn(2, 2), torch.Tensor) - self.int_list = torch.jit.Attribute([1, 2, 3, 4], List[int]) + for name, value, the_type in tester.get_pickle_values(): + setattr(self, name, torch.jit.Attribute(value, the_type)) @torch.jit.script_method def forward(self): - return (self.table, self.float, self.int, self.bool, self.tuple, self.list, self.int_list) + return (self.dict, self.float, self.int, self.bool, self.tuple, + self.list, self.int_list, self.tensor_list, self.bool_list, + self.float_list, self.str_list, self.none) m = M() imported_m = self.getExportImportCopy(m) @@ -12277,21 +12630,19 @@ def fn(x): @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") def test_attribute_unpickling(self): tensor = torch.randn(2, 2) + tester = self class M(torch.jit.ScriptModule): def __init__(self): super(M, self).__init__() - self.table = torch.jit.Attribute({"I": "am", "a test": "test"}, Dict[str, str]) - self.float = torch.jit.Attribute(2.3, float) - self.int = torch.jit.Attribute(99, int) - self.tuple = torch.jit.Attribute((1, 2, 3, 4), Tuple[int, int, int, int]) - self.list = torch.jit.Attribute([(1, 2), (3, 4)], List[Tuple[int, int]]) - self.tensor = torch.jit.Attribute(tensor, torch.Tensor) - self.int_list = torch.jit.Attribute([1, 2, 3, 4], List[int]) + for name, value, the_type in tester.get_pickle_values(): + setattr(self, name, torch.jit.Attribute(value, the_type)) @torch.jit.script_method def forward(self): - return (self.table, self.float, self.int, self.tuple, self.list, self.int_list) + return (self.dict, self.float, self.int, self.bool, self.tuple, + self.list, self.int_list, self.tensor_list, self.bool_list, + self.float_list, self.str_list, self.none) with TemporaryFileName() as fname: M().save(fname) @@ -12300,10 +12651,32 @@ def forward(self): pickled_data = archive.read(os.path.join(archive_name, 'attributes.pkl')) out = pickle.load(io.BytesIO(pickled_data)) - self.assertEqual(out[0], {"I": "am", "a test": "test"}) - self.assertEqual(out[1], 2.3) - self.assertEqual(out[2], 99) - self.assertEqual(out[6], [1, 2, 3, 4]) + def is_tensor_value(item): + if isinstance(item, torch.Tensor): + return True + if isinstance(item, list): + return is_tensor_value(item[0]) + return False + + for loaded_item, item in zip(out, self.get_pickle_values()): + if is_tensor_value(item[1]): + continue + self.assertEqual(item[1], loaded_item) + + def test_script_recurse(self): + def a_python_fn(a, b, c): + return a + b + c + + with 
torch.jit._enable_recursive_script(): + @torch.jit.script + def a_script_fn(d, e, f): + return a_python_fn(d, e, f) + + graph = str(a_script_fn.graph) + FileCheck().check("aten::add").run(graph) + FileCheck().check_not("a_python_fn").run(graph) + t = torch.ones(2, 2) + self.assertEqual(a_script_fn(t, t, t), t + t + t) @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") def test_old_models_bc(self): @@ -12521,6 +12894,13 @@ def test_pickle_checkpoint(self): self._test_pickle_checkpoint('cpu') self._test_pickle_checkpoint_views('cpu') + def test_string_list(self): + def fn(string): + # type: (str) -> List[str] + return list(string) + + self.checkScript(fn, ("abcdefgh",)) + def test_split(self): def split_two(tensor): a, b, c = torch.split(tensor, 2, dim=1) @@ -12529,6 +12909,14 @@ def split_two(tensor): y = torch.randn(3, 6) self.checkScript(split_two, [(x + y)]) + def test_python_op_name(self): + import random + + with self.assertRaisesRegex(RuntimeError, "randint"): + @torch.jit.script + def fn(): + return random.randint() + class MnistNet(nn.Module): def __init__(self): @@ -13820,14 +14208,14 @@ class TestJitGeneratedFunctional(JitTestCase): '', (True, 'aten::_batch_norm_impl_index')), ('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),), ('layer_norm', (S, S, S, S), ([5],), '', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)),), 'with_only_weight', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], None, non_differentiable(torch.rand(S)),), 'with_only_bias', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)), non_differentiable(torch.rand(S))), 'with_weight_and_bias', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])), ('group_norm', (S, S, S), (1, torch.rand(5),),), ('local_response_norm', (S, S, S), (2, ),), ('nll_loss', F.log_softmax(torch.randn(3, 5), dim=0), (torch.tensor([1, 0, 4]),), '', (True, 'aten::nll_loss_forward')), @@ -15273,6 +15661,53 @@ def _xor(): # noqa: E306 def test(): return Foo(torch.tensor(1)) + Foo(torch.tensor(1)) + def test_cast_overloads(self): + @torch.jit.script + class Foo(object): + def __init__(self, val): + # type: (float) -> None + self.val = val + + def __int__(self): + return int(self.val) + + def __float__(self): + return self.val + + def __bool__(self): + return bool(self.val) + + def __str__(self): + return str(self.val) + + def test(foo): + # type: (Foo) -> Tuple[int, float, bool] + if foo: + pass + return int(foo), float(foo), bool(foo) + + fn = torch.jit.script(test) + self.assertEqual(fn(Foo(0.5)), test(0.5)) + self.assertEqual(fn(Foo(0.)), test(0.0)) + # str has slightly different formatting + self.assertTrue("0.5" in (str(Foo(0.5)))) + self.assertTrue("0." 
in (str(Foo(0.0)))) + + @torch.jit.script + class BadBool(object): + def __init__(self): + pass + + def __bool__(self): + return (1, 2) + + with self.assertRaisesRegex(RuntimeError, "expected a bool expression for condition"): + @torch.jit.script + def test(): + if BadBool(): + print(1) + pass + def test_init_compiled_first(self): @torch.jit.script # noqa: B903 class Foo(object): diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py index d5b93381eae8..e992caee55c4 100644 --- a/test/test_jit_fuser.py +++ b/test/test_jit_fuser.py @@ -282,6 +282,22 @@ def funcOptMax(a, b): graph = backward_graph(s) self.assertAllFused(graph) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm + def test_dropout(self): + def func(x): + x = torch.nn.functional.dropout(x) + return torch.nn.functional.relu(x) + + a = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True) + s = torch.jit.script(func, (a,)) + self.assertAllFused(s.graph_for(a,), except_for={'aten::div', 'prim::Constant'}) + c = s(a) + c.sum().backward() + graph = backward_graph(s) + self.assertAllFused(graph, except_for={'aten::div', 'prim::Constant'}) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @skipIfRocm @@ -478,7 +494,7 @@ def test_norm_decompose(nm, in_opt_graph, not_in_opt_graph, in_fusegraph): # test for layernorm decompose lm = nn.LayerNorm(8) test_norm_decompose(lm, ['aten::batch_norm_stats'], - ['aten::layer_norm('], ['aten::sub', 'aten::mul', 'aten::addcmul']) + ['aten::layer_norm('], ['aten::sub', 'aten::mul', 'aten::add']) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py index ea3ae4228aa9..c82fcaf51795 100644 --- a/test/test_mkldnn.py +++ b/test/test_mkldnn.py @@ -3,8 +3,10 @@ import unittest import torch +import torch.jit from torch.utils import mkldnn as mkldnn_utils -from common_utils import TestCase, run_tests +from common_utils import TestCase, run_tests, TemporaryFileName + from torch.autograd.gradcheck import gradgradcheck, gradcheck @@ -88,7 +90,7 @@ def test_detach(self): def test_repr(self): self.assertTrue("layout=torch._mkldnn" in str(torch.randn((1, 2, 3, 4), - dtype=torch.float, device=torch.device('cpu')).to_mkldnn())) + dtype=torch.float, device=torch.device('cpu')).to_mkldnn())) def test_conv2d(self): for groups in [1, 4]: @@ -109,6 +111,9 @@ def test_conv2d(self): conv2d(x), mkldnn_conv2d(x.to_mkldnn()).to_dense()) + self._test_serialization(mkldnn_conv2d, (x.to_mkldnn(),)) + self._test_tracing(mkldnn_conv2d, (x.to_mkldnn(),)) + def test_relu(self): x = torch.randn((4, 5), dtype=torch.float32) * 10 self.assertEqual(torch.relu(x), torch.relu(x.to_mkldnn()).to_dense()) @@ -172,6 +177,9 @@ def test_batch_norm2d(self): bn(x), mkldnn_bn(x.to_mkldnn()).to_dense()) + self._test_serialization(mkldnn_bn, (x.to_mkldnn(),)) + self._test_tracing(mkldnn_bn, (x.to_mkldnn(),)) + def test_add(self): N = torch.randint(3, 10, (1,)).item() C = torch.randint(3, 100, (1,)).item() @@ -231,12 +239,41 @@ def test_linear(self): x = torch.randn(3, in_features, dtype=torch.float32) * 10 for bias in [True, False]: - linear = torch.nn.Linear(in_features, out_features).float() + linear = torch.nn.Linear(in_features, out_features, bias=bias).float() mkldnn_linear = mkldnn_utils.to_mkldnn(copy.deepcopy(linear)) self.assertEqual( linear(x), 
mkldnn_linear(x.to_mkldnn()).to_dense()) + self._test_serialization(mkldnn_linear, (x.to_mkldnn(),)) + self._test_tracing(mkldnn_linear, (x.to_mkldnn(),)) + + def test_sigmoid(self): + x = torch.randn(4, 5, dtype=torch.float32) * 10 + mkldnn_x = x.to_mkldnn() + self.assertEqual( + torch.sigmoid(x), + torch.sigmoid(mkldnn_x).to_dense(), + ) + # inplace + torch.sigmoid_(x) + torch.sigmoid_(mkldnn_x) + self.assertEqual(x, mkldnn_x.to_dense()) + + def _test_serialization(self, module, inputs): + with TemporaryFileName() as fname: + torch.jit.save(module, fname) + loaded = torch.jit.load(fname) + self.assertEqual( + module(*inputs).to_dense(), + loaded(*inputs).to_dense()) + + def _test_tracing(self, module, inputs): + traced = torch.jit.trace(module, inputs, check_trace=False) + self.assertEqual( + module(*inputs).to_dense(), + traced(*inputs).to_dense()) + if __name__ == '__main__': run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index 9ef82d54b160..5105dabb69d1 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2070,24 +2070,61 @@ def test_embedding_dense_grad(self): def test_embedding_dense_grad_cuda(self): self._test_embedding_dense_grad("cuda") + def test_move_sparse_half_embedding(self): + embedding = nn.Embedding(10, 3, sparse=True) + self.assertEqual(embedding.weight.device.type, 'cpu') + self.assertEqual(embedding.weight.dtype, torch.float64) + embedding.to(torch.float16) + self.assertEqual(embedding.weight.dtype, torch.float16) + self.assertEqual(embedding.embedding_dim, 3) + self.assertEqual(embedding.num_embeddings, 10) + + if torch.cuda.is_available(): + embedding.to('cuda') + self.assertEqual(embedding.weight.device.type, 'cuda') + embedding.to('cpu') + self.assertEqual(embedding.weight.device.type, 'cpu') + def test_embedding_sparse_backward(self): + self._test_embedding_backward() + + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_embedding_sparse_half_backward(self): + # same as test_embedding_sparse_backward above but testing half types in + # cuda. cpu sum not supported for half types. 
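# A minimal standalone sketch (plain CPU PyTorch, default dtypes assumed) of the
# sparse-gradient layout that _test_embedding_backward below asserts on: with
# sparse=True, weight.grad is a sparse tensor whose _indices() are the looked-up rows
# and whose _values() are the per-row gradients (all ones here, since the loss is .sum()).
import torch
import torch.nn as nn

emb = nn.Embedding(10, 3, sparse=True)
emb(torch.tensor([7, 1, 3])).sum().backward()
assert emb.weight.grad._indices().tolist() == [[7, 1, 3]]
assert emb.weight.grad._values().equal(torch.ones(3, 3))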
+ self._test_embedding_backward('cuda', torch.float16) + + def _test_embedding_backward(self, device='cpu', dtype=torch.float64): embedding = nn.Embedding(10, 3, sparse=True) + tensor = torch.tensor([[7, 1, 3]]) + ones = torch.tensor(1.).expand(3, 3) + tensorTwice = tensor.repeat(1, 2) + onesTwice = torch.cat((ones, ones)) + + embedding = embedding.to(dtype=dtype).to(device) + tensor = tensor.to(device) + ones = ones.to(device) + tensorTwice = tensorTwice.to(device) + onesTwice = onesTwice.to(device) + embedding.zero_grad() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - self.assertEqual(embedding.weight.grad._indices(), torch.LongTensor([[7, 1, 3]])) - self.assertEqual(embedding.weight.grad._values(), torch.tensor(1.).expand(3, 3)) + embedding(tensor[0]).sum().backward() + self.assertEqual(embedding.weight.grad._indices(), tensor) + self.assertEqual(embedding.weight.grad._values(), ones) embedding.zero_grad() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - self.assertEqual(embedding.weight.grad._indices(), torch.LongTensor([[7, 1, 3, 7, 1, 3]])) - self.assertEqual(embedding.weight.grad._values(), torch.tensor(1.).expand(6, 3)) + embedding(tensor[0]).sum().backward() + embedding(tensor[0]).sum().backward() + self.assertEqual(embedding.weight.grad._indices(), tensorTwice) + self.assertEqual(embedding.weight.grad._values(), onesTwice) embedding.zero_grad() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - embedding(torch.LongTensor([8, 1, 3])).sum().backward() - self.assertEqual(embedding.weight.grad._indices(), torch.LongTensor([[7, 1, 3, 8, 1, 3]])) - self.assertEqual(embedding.weight.grad._values(), torch.tensor(1.).expand(6, 3)) + embedding(tensor[0]).sum().backward() + tensor[0, 0] = 8 + embedding(tensor[0]).sum().backward() + tensorTwice[0, 3] = 8 + self.assertEqual(embedding.weight.grad._indices(), tensorTwice) + self.assertEqual(embedding.weight.grad._values(), onesTwice) def test_embedding_padding_idx(self): embedding = nn.Embedding(10, 20, padding_idx=0) @@ -2377,6 +2414,7 @@ def _test_EmbeddingBag_vs_Embedding(self, N, D, B, L, max_norm=None, needed_prec = dtype2prec[dtype] * 2 else: needed_prec = backward_prec + self.assertEqual(es_weight_grad, e.weight.grad, needed_prec) if test_per_sample_weights and trainable_per_sample_weights: @@ -2564,12 +2602,13 @@ def test_contig_wrong_stride_cudnn(self): def test_embedding_bag(self): for dtype in [torch.double, torch.float]: - # TODO: figure out why backward on float breaks - test_backward = dtype is not torch.float - self._test_EmbeddingBag(False, 'sum', False, test_backward=test_backward, dtype=dtype) - self._test_EmbeddingBag(False, 'mean', False, test_backward=test_backward, dtype=dtype) - self._test_EmbeddingBag(False, 'max', False, test_backward=test_backward, dtype=dtype) + self._test_EmbeddingBag(False, 'sum', False, dtype=dtype) + self._test_EmbeddingBag(False, 'mean', False, dtype=dtype) + self._test_EmbeddingBag(False, 'max', False, dtype=dtype) + # TODO: figure out why precision on sparse embeddings isn't the + # same as for dense. 
+ test_backward = dtype is not torch.float self._test_EmbeddingBag(False, 'sum', True, test_backward=test_backward, dtype=dtype) self._test_EmbeddingBag(False, 'mean', True, test_backward=test_backward, dtype=dtype) @@ -2733,10 +2772,11 @@ def test_embedding_bag_cuda(self, dtype=torch.float): self._test_EmbeddingBag(True, 'sum', False, dtype) self._test_EmbeddingBag(True, 'mean', False, dtype) self._test_EmbeddingBag(True, 'max', False, dtype) - if dtype != torch.half: - # torch.cuda.sparse.HalfTensor is not enabled. - self._test_EmbeddingBag(True, 'sum', True, dtype) - self._test_EmbeddingBag(True, 'mean', True, dtype) + + # see 'todo' in test_embedding_bag. + test_backward = dtype is not torch.float16 + self._test_EmbeddingBag(True, 'sum', True, dtype, test_backward=test_backward) + self._test_EmbeddingBag(True, 'mean', True, dtype, test_backward=test_backward) def test_fractional_max_pool2d(self): x = torch.randn(1, 2, 7, 7, requires_grad=True) @@ -3197,29 +3237,36 @@ def verify_reduction_scalars(input, reduction, output): output = m(sigmoid(input), target) verify_reduction_scalars(input, reduction, output) + @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), + "Scipy v1.0 and/or numpy not found") def test_multihead_attention(self): - def _scaled_dot_attn_ref(Q, K, V, dims, unseen_mask=False, src_lengths=None): + def _scaled_dot_attn_ref(Q, K, V, dims, unseen_mask=None, src_lengths=None, + attn_mask=None, add_zero_attn=False): """ Numpy-based reference implementation of scaled dot attention for testing""" + QKT = _batchmatmul( Q, np.transpose(K, axes=[0, 1, 3, 2]) / np.sqrt(dims[3], dtype=np.float32), # divide by sqrt(d_head) ) - if unseen_mask or src_lengths is not None: - b1, b2, s1, s2 = QKT.shape + b1, b2, s1, s2 = QKT.shape + if unseen_mask is not None or src_lengths is not None: # assert s1 == s2 for i in range(b1): for j in range(b2): for m in range(s1): for n in range(s2): - if unseen_mask and n > m: + if unseen_mask[m][n] == 0: QKT[i, j, m, n] = -np.inf if src_lengths is not None and n >= src_lengths[i]: QKT[i, j, m, n] = -np.inf + reference = _softmax(QKT) + ref_attn_weight = reference + ref_attn_weight = np.sum(ref_attn_weight, axis=1) / b2 reference = _batchmatmul(reference, V) - return reference + return reference, ref_attn_weight def _batchmatmul(a, b): # batchmatmul over 4 dim matrix """ Numpy-based batch matrix multiply over 4 dim matrix""" @@ -3235,7 +3282,8 @@ def _batchmatmul(a, b): # batchmatmul over 4 dim matrix def _softmax(x): # softmax over 4 dim matrix """ Numpy-based reference softmax over 4 dim matrix""" - output = np.zeros(x.shape, dtype=np.float32) + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) for i in range(x.shape[0]): for j in range(x.shape[1]): for k in range(x.shape[2]): @@ -3298,7 +3346,7 @@ def _create_src_lengths_mask(batch_size, src_lengths): # returns [batch_size, max_seq_len] return (src_indices < src_lengths).int().detach() - def _multihead_attn_test_helper(use_src_lengths): + def _multihead_attn_test_helper(add_key_padding_mask, add_bias_kv=False, add_zero_attn=False): for _ in range(100): batch_sz, seq_len = [random.randint(2, 10) for r in range(2)] d_head = random.randint(3, 10) @@ -3308,7 +3356,7 @@ def _multihead_attn_test_helper(use_src_lengths): src_lengths = None src_lengths_tensor = None - if use_src_lengths: + if add_key_padding_mask: src_lengths, src_lengths_tensor = _generate_src_lengths( batch_size=batch_sz, seq_len=seq_len ) @@ -3317,28 +3365,44 @@ def 
_multihead_attn_test_helper(use_src_lengths): K = np.random.rand(*dims).astype(np.float64) V = K Q = np.expand_dims(decoder_state, 1) + attn_mask = np.random.randint(0 , 2, size=(1, seq_len)) + attn_mask_tensor = torch.from_numpy(attn_mask).float() + attn_mask_tensor.masked_fill_(attn_mask_tensor == 0, float('-inf')) + attn_mask_tensor.masked_fill_(attn_mask_tensor > 0, float('0.0')) + attn_mask_tensor = attn_mask_tensor.double() decoder_state_tensor = torch.from_numpy(decoder_state).double() source_hid_tensor = torch.from_numpy(K).double().transpose(0, 1) - multihead_attn_module = MultiheadAttention(d_model, nheads) + multihead_attn_module = MultiheadAttention(d_model, nheads, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn) + + if add_bias_kv: + bias_k = multihead_attn_module.bias_k.detach().numpy() + bias_v = multihead_attn_module.bias_v.detach().numpy() + else: + bias_k = None + bias_v = None _batch_size = decoder_state_tensor.shape[0] _Q = decoder_state_tensor.unsqueeze(1).transpose(0, 1) _V = source_hid_tensor _K = source_hid_tensor src_len_mask = None - if src_lengths is not None and use_src_lengths: + if src_lengths is not None and add_key_padding_mask: # [batch_size, 1, seq_len] src_len_mask_int = _create_src_lengths_mask( batch_size=_batch_size, src_lengths=src_lengths_tensor ) src_len_mask = src_len_mask_int != 1 - - result = multihead_attn_module( + result, result_weight = multihead_attn_module( _Q, _K, _V, key_padding_mask=src_len_mask, - need_weights=True)[0].squeeze(0).detach().numpy() + need_weights=True, + attn_mask=attn_mask_tensor) + + result = result.squeeze(0).detach().numpy() Q_fc = _fc(Q, "in_proj_", multihead_attn_module, end=d_model) K_fc = _fc( @@ -3346,20 +3410,31 @@ def _multihead_attn_test_helper(use_src_lengths): ) V_fc = _fc(V, "in_proj_", multihead_attn_module, start=2 * d_model) + if add_bias_kv: + K_fc = np.concatenate((K_fc, np.repeat(bias_k, K_fc.shape[0], axis=0)), axis=1) + V_fc = np.concatenate((V_fc, np.repeat(bias_v, V_fc.shape[0], axis=0)), axis=1) + attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1) + dims[1] += 1 Q_split = _split_heads_ref( Q_fc, [batch_sz, 1, d_model], nheads, d_head ) K_split = _split_heads_ref(K_fc, dims, nheads, d_head) V_split = _split_heads_ref(V_fc, dims, nheads, d_head) - attn_heads = _scaled_dot_attn_ref( + if add_zero_attn: + dims[1] += 1 + K_split = np.concatenate((K_split, np.zeros([K_split.shape[0], K_split.shape[1], 1, K_split.shape[3]])), axis=2) + V_split = np.concatenate((V_split, np.zeros([V_split.shape[0], V_split.shape[1], 1, V_split.shape[3]])), axis=2) + attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1) + + attn_heads, ref_attn_weight = _scaled_dot_attn_ref( Q=Q_split, K=K_split, V=V_split, dims=Q_split.shape, - src_lengths=src_lengths, + unseen_mask=attn_mask, + src_lengths=src_lengths ) - combined_attn_heads = _combine_heads_ref( X=attn_heads, dims=[batch_sz, 1], nheads=nheads, d_head=d_head ) @@ -3373,14 +3448,27 @@ def _multihead_attn_test_helper(use_src_lengths): self.assertEqual(tuple(result.shape), (batch_sz, d_model)) np.testing.assert_allclose(result, reference, atol=1e-5) + # result_weight = ref_attn_weight + result_weight = result_weight.detach().numpy() + self.assertEqual(tuple(result_weight.shape), tuple(ref_attn_weight.shape)) + np.testing.assert_allclose(result_weight, ref_attn_weight, atol=1e-5) + + def test_multihead_attn_add_bias_kv(): + _multihead_attn_test_helper(add_key_padding_mask=None, add_bias_kv=True) + + def test_multihead_attn_add_zero_attn(): 
+ _multihead_attn_test_helper(add_key_padding_mask=None, add_zero_attn=True) + def test_multihead_attn_no_masking(): - _multihead_attn_test_helper(use_src_lengths=None) + _multihead_attn_test_helper(add_key_padding_mask=None) - def test_multihead_attn_with_src_lengths(): - _multihead_attn_test_helper(use_src_lengths=True) + def test_multihead_attn_key_padding_mask(): + _multihead_attn_test_helper(add_key_padding_mask=True) + test_multihead_attn_add_zero_attn() # Test MultiheadAttention with add_zero_attn + test_multihead_attn_add_bias_kv() # Test MultiheadAttention with add_bias_kv test_multihead_attn_no_masking() # Test MultiheadAttention without masking - test_multihead_attn_with_src_lengths() # Test MultiheadAttention with src lengths + test_multihead_attn_key_padding_mask() # Test MultiheadAttention with src lengths def test_normalize(self): inputs = torch.randn(1, 3, 4, 4, requires_grad=True) @@ -4308,6 +4396,19 @@ def test_load_state_dict_BC(self): self.assertEqual(bn.num_batches_tracked.dtype, torch.long) self.assertEqual(bn.num_batches_tracked.item(), 0) + @unittest.skipIf(not PY3, 'Python 2.7 generates cyclic trash') + def test_load_state_dict_ref_cycle(self): + # load_state_dict shouldn't cause a reference cycle involving Tensors + import gc + + m = torch.nn.LSTM(16, 16, bidirectional=True) + + gc.collect() + m.load_state_dict(deepcopy(m).state_dict()) + refcycles = gc.collect() + + self.assertEqual(refcycles, 0) + def test_parameter_assignment(self): l = nn.Linear(5, 5) @@ -4939,6 +5040,17 @@ def test_invalid_dropout_p(self): self.assertRaises(ValueError, lambda: F.dropout(v, -0.1)) self.assertRaises(ValueError, lambda: F.dropout(v, 1.1)) + def test_empty_dropout(self): + x = torch.Tensor([]) + out = torch.nn.functional.dropout(x) + self.assertEqual(out.size(), x.size()) + + @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + def test_empty_dropout_cuda(self): + x = torch.Tensor([]).to('cuda') + out = torch.nn.functional.dropout(x) + self.assertEqual(out.size(), x.size()) + def test_pad_sequence(self): def pad(tensor, length): return torch.cat( diff --git a/test/test_nn_quantized.py b/test/test_nn_quantized.py index 845fc0f7b00b..fa7fafaa8831 100644 --- a/test/test_nn_quantized.py +++ b/test/test_nn_quantized.py @@ -18,7 +18,7 @@ def test_functional_api(self): Y = X.numpy().copy() Y[Y < 0] = 0 qY = _quantize(Y, scale, zero_point) - qX = X.quantize_linear(scale=scale, zero_point=zero_point) + qX = X.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) qY_hat = F.relu(qX) np.testing.assert_equal(qY, qY_hat.int_repr()) diff --git a/test/test_numba_integration.py b/test/test_numba_integration.py index 105e181a128f..deacc58902e9 100644 --- a/test/test_numba_integration.py +++ b/test/test_numba_integration.py @@ -256,6 +256,94 @@ def test_active_device(self): numba.cuda.as_cuda_array(cudat), numba.cuda.devicearray.DeviceNDArray ) + @unittest.skipIf(not TEST_NUMPY, "No numpy") + @unittest.skipIf(not TEST_CUDA, "No cuda") + @unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda") + def test_from_cuda_array_interface(self): + """torch.as_tensor() and torch.tensor() supports the __cuda_array_interface__ protocol. + + If an object exposes the __cuda_array_interface__, .as_tensor() and .tensor() + will use the exposed device memory. 
+ + See: + https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + """ + + dtypes = [ + numpy.float64, + numpy.float32, + numpy.int64, + numpy.int32, + numpy.int16, + numpy.int8, + numpy.uint8, + ] + for dtype in dtypes: + numpy_arys = [ + numpy.arange(6).reshape(2, 3).astype(dtype), + numpy.arange(6).reshape(2, 3).astype(dtype)[1:], # View offset should be ignored + numpy.arange(6).reshape(2, 3).astype(dtype)[:, None], # change the strides but still contiguous + ] + # Zero-copy when using `torch.as_tensor()` + for numpy_ary in numpy_arys: + numba_ary = numba.cuda.to_device(numpy_ary) + torch_ary = torch.as_tensor(numba_ary, device="cuda") + self.assertEqual(numba_ary.__cuda_array_interface__, torch_ary.__cuda_array_interface__) + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + + # Check that `torch_ary` and `numba_ary` points to the same device memory + torch_ary += 42 + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + + # Implicit-copy because `torch_ary` is a CPU array + for numpy_ary in numpy_arys: + numba_ary = numba.cuda.to_device(numpy_ary) + torch_ary = torch.as_tensor(numba_ary, device="cpu") + self.assertEqual(torch_ary.data.numpy(), numpy.asarray(numba_ary)) + + # Check that `torch_ary` and `numba_ary` points to different memory + torch_ary += 42 + self.assertEqual(torch_ary.data.numpy(), numpy.asarray(numba_ary) + 42) + + # Explict-copy when using `torch.tensor()` + for numpy_ary in numpy_arys: + numba_ary = numba.cuda.to_device(numpy_ary) + torch_ary = torch.tensor(numba_ary, device="cuda") + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + + # Check that `torch_ary` and `numba_ary` points to different memory + torch_ary += 42 + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary) + 42) + + @unittest.skipIf(not TEST_NUMPY, "No numpy") + @unittest.skipIf(not TEST_CUDA, "No cuda") + @unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda") + def test_from_cuda_array_interface_lifetime(self): + """torch.as_tensor(obj) tensor grabs a reference to obj so that the lifetime of obj exceeds the tensor""" + numba_ary = numba.cuda.to_device(numpy.arange(6)) + torch_ary = torch.as_tensor(numba_ary, device="cuda") + self.assertEqual(torch_ary.__cuda_array_interface__, numba_ary.__cuda_array_interface__) # No copy + del numba_ary + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.arange(6)) # `torch_ary` is still alive + + @unittest.skipIf(not TEST_NUMPY, "No numpy") + @unittest.skipIf(not TEST_CUDA, "No cuda") + @unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda") + @unittest.skipIf(not TEST_MULTIGPU, "No multigpu") + def test_from_cuda_array_interface_active_device(self): + """torch.as_tensor() tensor device must match active numba context.""" + + # Both torch/numba default to device 0 and can interop freely + numba_ary = numba.cuda.to_device(numpy.arange(6)) + torch_ary = torch.as_tensor(numba_ary, device="cuda") + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + self.assertEqual(torch_ary.__cuda_array_interface__, numba_ary.__cuda_array_interface__) + + # Torch should raise `RuntimeError` when the Numba and Torch device differ + numba_ary = numba.cuda.to_device(numpy.arange(6)) + with self.assertRaises(RuntimeError): + torch.as_tensor(numba_ary, device=torch.device("cuda", 1)) + if __name__ == "__main__": common.run_tests() diff --git a/test/test_quantized.py b/test/test_quantized.py index e1c6b0ec2f74..8553265d372e 100644 --- 
a/test/test_quantized.py +++ b/test/test_quantized.py @@ -25,13 +25,21 @@ def _dequantize(qx, scale, zero_point): return x +def _requantize(x, multiplier, zero_point, qmin=0, qmax=255, qtype=np.uint8): + """Requantizes a numpy array, i.e., intermediate int32 or int16 values are + converted back to given type""" + qx = (x * multiplier).round() + zero_point + qx = np.clip(qx, qmin, qmax).astype(qtype) + return qx + + # Make sure we won't have overflows from vpmaddubsw instruction used in FBGEMM. # On the current Intel x86 architecture, we need to utilize vpmaddubsw instruction # for the 8-bit int multiplication. This instruction vertically multiplies each # unsigned 8-bit integer from a with the corresponding signed 8-bit integer from # b, producing intermediate signed 16-bit integers. This function modifies the # weights to eliminate the overflow on the signed 16-bit integers. -def avoid_vpmaddubsw_overflow_fc( +def avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max ): for i, j in np.ndindex((batch_size, output_channels)): @@ -57,8 +65,8 @@ def avoid_vpmaddubsw_overflow_fc( assert -(1 << 15) <= x0 * w0 + x1 * w1 < (1 << 15) -# Reference quantized FC operator -def qfc_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): +# Reference quantized Linear operator +def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): row_offsets_ref = X_q.sum(axis=1).astype(np.int32).reshape((-1, 1)) col_offsets_ref = W_q.sum(axis=1).astype(np.int32).reshape((1, -1)) assert X_q.ndim == 2 @@ -122,9 +130,7 @@ def test_qrelu(self): X = torch.arange(-5, 5, dtype=torch.float) scale = 2.0 zero_point = 1 - qX = X.quantize_linear(scale=scale, zero_point=zero_point) - # print("X:\n{}".format(X)) - # print("\nQuantized:\n{}\nFake:\n{}".format(qX.int_repr(), _quantize(X.numpy(), scale, zero_point))) + qX = X.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) Y = X.numpy().copy() Y[Y < 0] = 0 @@ -132,28 +138,37 @@ def test_qrelu(self): qY_hat = relu(qX) np.testing.assert_equal(qY, qY_hat.int_repr()) - """Tests the correctness of the quantized::sum_relu op.""" - def test_qsumrelu_same_qparams(self): - sum_relu = torch.ops.quantized.sum_relu + """Tests the correctness of the add and add_relu op.""" + def test_qadd_relu_same_qparams(self): + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add A = torch.arange(-25, 25, dtype=torch.float) B = torch.arange(-25, 25, dtype=torch.float) scale = 2.0 zero_point = 127 - qA = A.quantize_linear(scale=scale, zero_point=zero_point) - qB = A.quantize_linear(scale=scale, zero_point=zero_point) + qA = A.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) + qB = A.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) - # Sum + ReLU ground truth + # Add ReLU ground truth C = (qA.dequantize() + qB.dequantize()).numpy() - C[C < 0] = 0 qC = _quantize(C, scale, zero_point) - - qC_hat = sum_relu(qA, qB, scale=scale, zero_point=zero_point) - np.testing.assert_equal(qC, qC_hat.int_repr()) - - """Tests the correctness of the quantized::sum_relu op.""" - def test_qsumrelu_different_qparams(self): - sum_relu = torch.ops.quantized.sum_relu + qC_hat = add(qA, qB, scale=scale, zero_point=zero_point) + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") + + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale, zero_point) + qCrelu_hat = add_relu(qA, qB, scale=scale, 
zero_point=zero_point) + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") + + """Tests the correctness of the add and add_relu op.""" + def test_qadd_relu_different_qparams(self): + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add A = torch.arange(-25, 25, dtype=torch.float) B = torch.arange(-25, 25, dtype=torch.float) @@ -165,30 +180,37 @@ def test_qsumrelu_different_qparams(self): scale_C = 0.5 zero_point_C = 5 - qA = A.quantize_linear(scale=scale_A, zero_point=zero_point_A) - qB = A.quantize_linear(scale=scale_B, zero_point=zero_point_B) + qA = A.quantize_linear(scale=scale_A, zero_point=zero_point_A, dtype=torch.quint8) + qB = A.quantize_linear(scale=scale_B, zero_point=zero_point_B, dtype=torch.quint8) - # Sum + ReLU ground truth + # Add ground truth C = (qA.dequantize() + qB.dequantize()).numpy() - C[C < 0] = 0 qC = _quantize(C, scale_C, zero_point_C) + qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point_C) + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") - qC_hat = sum_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) - np.testing.assert_equal(qC, qC_hat.int_repr()) + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale_C, zero_point_C) + qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") @unittest.skipIf( TEST_WITH_UBSAN or not torch.fbgemm_is_cpu_supported(), - " Quantized FC requires FBGEMM. FBGEMM does not play" + " Quantized Linear requires FBGEMM. FBGEMM does not play" " well with UBSAN at the moment, so we skip the test if" " we are in a UBSAN environment.", ) -class TestQuantizedFC(unittest.TestCase): - """Tests the correctness of the quantized::fc op.""" +class TestQuantizedLinear(unittest.TestCase): + """Tests the correctness of the quantized::fbgemm_linear op.""" - def test_qfc(self): - qfc_prepack = torch.ops.quantized.fbgemm_linear_prepack - qfc = torch.ops.quantized.fbgemm_linear + def test_qlinear(self): + qlinear_prepack = torch.ops.quantized.fbgemm_linear_prepack + qlinear = torch.ops.quantized.fbgemm_linear batch_size = 4 input_channels = 16 @@ -204,7 +226,6 @@ def test_qfc(self): ).astype(np.uint8) W_scale = 0.4 - # W_zp is the zero point for int8 quantization. W_zp = 2 W_value_min = -128 W_value_max = 127 @@ -214,7 +235,13 @@ def test_qfc(self): + W_value_min ).astype(np.int8) - avoid_vpmaddubsw_overflow_fc( + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) + + avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, output_channels, @@ -228,24 +255,24 @@ def test_qfc(self): X = torch.from_numpy(_dequantize(X_q0, X_scale, X_zp)).to(dtype=torch.float) W = torch.from_numpy(_dequantize(W_q0, W_scale, W_zp)).to(dtype=torch.float) + b = torch.from_numpy(_dequantize(b_q0, X_scale * W_scale, 0)).to(dtype=torch.float) - X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp) - # W_zp + 128 is the zero point for uint8 quantization. 
- W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp + 128) - b_q = torch.round(torch.rand(output_channels) * 10 - 10).to(dtype=torch.int32) + X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp, dtype=torch.quint8) + W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp, dtype=torch.qint8) + b_q = b.quantize_linear(scale=X_scale * W_scale, zero_point=0, dtype=torch.qint32) # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with # Y_scale * 255 (max for uint8). Y_scale = 125.1234 Y_zp = 5 - # Reference quantized FC operator - Y_q_ref = qfc_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q.numpy(), Y_scale, Y_zp) + # Reference quantized Linear operator + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q0, Y_scale, Y_zp) - # Weight prepacking operator for quantized FC - W_prepack = qfc_prepack(W_q) - # Quantized FC operator with prepacked weight - Y_q = qfc(X_q, W_prepack, b_q, Y_scale, Y_zp) + # Weight prepacking operator for quantized Linear + W_prepack = qlinear_prepack(W_q) + # Quantized Linear operator with prepacked weight + Y_q = qlinear(X_q, W_prepack, b_q, Y_scale, Y_zp) # Y_q_ref_real = _dequantize(Y_q_ref, Y_scale, Y_zp) # Y_q_real = Y_q.dequantize() @@ -256,18 +283,18 @@ def test_qfc(self): # Reference quantized result from PyTorch Linear operator W_fp32 = W_q.dequantize().to(dtype=torch.float) X_fp32 = X_q.dequantize().to(dtype=torch.float) - b_fp32 = torch.from_numpy(_dequantize(b_q.numpy(), W_scale * X_scale, 0).astype(np.float)).to(dtype=torch.float) + b_fp32 = b_q.dequantize().to(dtype=torch.float) Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) - Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp) + Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp, torch.quint8) # Assert equal np.testing.assert_equal(Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy()) - """Tests the correctness of the quantized::fc op.""" - def test_qfcrelu(self): - qfc_prepack = torch.ops.quantized.fbgemm_linear_prepack - qfcrelu = torch.ops.quantized.fbgemm_linear_relu + """Tests the correctness of the quantized::fbgemm_linear_relu op.""" + def test_qlinear_relu(self): + qlinear_prepack = torch.ops.quantized.fbgemm_linear_prepack + qlinear_relu = torch.ops.quantized.fbgemm_linear_relu batch_size = 4 input_channels = 16 @@ -292,7 +319,13 @@ def test_qfcrelu(self): + W_value_min ).astype(np.int8) - avoid_vpmaddubsw_overflow_fc( + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) + + avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, output_channels, @@ -306,24 +339,25 @@ def test_qfcrelu(self): X = torch.from_numpy(_dequantize(X_q0, X_scale, X_zp)).to(dtype=torch.float) W = torch.from_numpy(_dequantize(W_q0, W_scale, W_zp)).to(dtype=torch.float) + b = torch.from_numpy(_dequantize(b_q0, X_scale * W_scale, 0)).to(dtype=torch.float) - X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp) - W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp + 128) - b_q = torch.round(torch.rand(output_channels) * 10 - 10).to(dtype=torch.int32) + X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp, dtype=torch.quint8) + W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp, dtype=torch.qint8) + b_q = b.quantize_linear(scale=X_scale * W_scale, zero_point=0, dtype=torch.qint32) # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with # Y_scale * 255 (max for uint8). 
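# (Roughly: ignoring zero points and bias, the largest real output of the matmul is about
#  input_channels * (X_value_max * X_scale) * (W_value_max * W_scale), while a uint8 output
#  with scale Y_scale can represent real values only up to about Y_scale * 255, so this
#  comparison is a loose sanity check that the chosen Y_scale will not saturate the output.)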
Y_scale = 125.1234 Y_zp = 5 - # Reference quantized FC operator - Y_q_ref = qfc_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q.numpy(), Y_scale, Y_zp) + # Reference quantized Linear operator + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q0, Y_scale, Y_zp) Y_q_ref[Y_q_ref < Y_zp] = Y_zp - # Weight prepacking operator for quantized FC - W_prepack = qfc_prepack(W_q) - # Quantized FC operator with prepacked weight - Y_q = qfcrelu(X_q, W_prepack, b_q, Y_scale, Y_zp) + # Weight prepacking operator for quantized Linear + W_prepack = qlinear_prepack(W_q) + # Quantized Linear operator with prepacked weight + Y_q = qlinear_relu(X_q, W_prepack, b_q, Y_scale, Y_zp) # Y_q_ref_real = _dequantize(Y_q_ref, Y_scale, Y_zp) # Y_q_real = Y_q.dequantize() @@ -334,14 +368,128 @@ def test_qfcrelu(self): # Reference quantized result from PyTorch Linear operator W_fp32 = W_q.dequantize().to(dtype=torch.float) X_fp32 = X_q.dequantize().to(dtype=torch.float) - b_fp32 = torch.from_numpy(_dequantize(b_q.numpy(), W_scale * X_scale, 0).astype(np.float)).to(dtype=torch.float) + b_fp32 = b_q.dequantize().to(dtype=torch.float) Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0 - Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp) + Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp, torch.quint8) # Assert equal np.testing.assert_equal(Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy()) +@unittest.skipIf( + TEST_WITH_UBSAN or not torch.fbgemm_is_cpu_supported(), + " Quantized convolution requires FBGEMM. FBGEMM does not play" + " well with UBSAN at the moment, so we skip the test if" + " we are in a UBSAN environment.", +) +class TestQuantizedConv(unittest.TestCase): + """Tests the correctness of quantized convolution op.""" + def test_qconv(self): + + qconv = torch.ops.quantized.fbgemm_conv2d + qconv_prepack = torch.ops.quantized.fbgemm_conv_prepack + + # N + batch_size = 1 + # C + input_channels = 16 + # H, W + height = width = 24 + # K + output_channels = 8 + + kernel_h = kernel_w = 3 + stride_h = stride_w = 1 + padding_h = padding_w = 1 + dilation_h = dilation_w = 1 + groups = 1 + + W_value_min = 0 + W_value_max = 5 + # We use small values to avoid overflow. + # (the operator expects them in the format (output_channels, input_channels/groups, kernel_h, kernel_w)) + + W_init = torch.randint( + W_value_min, + W_value_max, + (output_channels, int(input_channels / groups), kernel_h, kernel_w), + ) + + b_init = torch.randint(0, 10, (output_channels,)) + + # Existing floating point conv operator + conv_op = torch.nn.Conv2d( + input_channels, + output_channels, + (kernel_h, kernel_w), + (stride_h, stride_w), + (padding_h, padding_w), + (dilation_h, dilation_w), + groups, + ) + + # assign the weights + conv_op.weight = torch.nn.Parameter( + W_init.to(dtype=torch.float), requires_grad=False + ) + conv_op.bias = torch.nn.Parameter( + b_init.to(dtype=torch.float), requires_grad=False + ) + + X_value_min = 0 + X_value_max = 4 + X_init = torch.randint( + X_value_min, X_value_max, (batch_size, input_channels, height, width) + ) + + # run on an input tensor + result_ref = conv_op(X_init.to(dtype=torch.float)) + + # reformat X_init and W_init in the required format by conv operator + # NCHW -> NHWC + X_NHWC = X_init.permute([0, 2, 3, 1]).contiguous() + # KCRS -> RSCK + W_RSCK = W_init.permute([2, 3, 1, 0]).contiguous() + + X_scale = 1.5 + # Currently only 0 as zero point is supported. 
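# Aside: a quick shape check of the layout permutations used by this conv test
# (purely illustrative; the sizes mirror the batch/channel/kernel settings above):
import torch

x_nchw = torch.zeros(1, 16, 24, 24)        # N, C, H, W
print(x_nchw.permute(0, 2, 3, 1).shape)    # torch.Size([1, 24, 24, 16])  NCHW -> NHWC

w_kcrs = torch.zeros(8, 16, 3, 3)          # K, C, R, S
print(w_kcrs.permute(2, 3, 1, 0).shape)    # torch.Size([3, 3, 16, 8])    KCRS -> RSCK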
+ X_zero_point = 0 + X = X_scale * (X_NHWC - X_zero_point).to(dtype=torch.float) + + W_scale = 2.5 + W_zero_point = 0 + W = W_scale * (W_RSCK - W_zero_point).to(dtype=torch.float) + + X_q = X.quantize_linear(scale=X_scale, zero_point=X_zero_point, dtype=torch.quint8) + W_q = W.quantize_linear(scale=W_scale, zero_point=W_zero_point, dtype=torch.quint8) + b_q = b_init.to(dtype=torch.int32) + + W_prepack = qconv_prepack(W_q, groups) + Y_scale = 7.3 + Y_zero_point = 5 + + Y_q = qconv( + X_q, + W_prepack, + b_q, + [1, 1], # stride + [1, 1], # padding + [1, 1], # dilation + [0, 0], # output_padding + 1, # groups + Y_scale, + Y_zero_point, + ) + + result_NHWK = result_ref.permute([0, 2, 3, 1]) + result_q = _requantize( + result_NHWK.numpy(), X_scale * W_scale / Y_scale, Y_zero_point + ) + + # Make sure the results match + np.testing.assert_equal(result_q, Y_q.int_repr().numpy()) + + if __name__ == "__main__": run_tests() diff --git a/test/test_sparse.py b/test/test_sparse.py index 764f0a38c552..e105f91139d9 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -234,33 +234,44 @@ def fn(x): [0, 0, 0, 3], [0, 0, 1, 4], ]) - v = self.value_tensor([2, 1, 3, 4]) - x = self.sparse_tensor(i, v, torch.Size([3, 4, 5])) - res = self.value_tensor([ - [[2, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]], - [[1, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]], - [[0, 3, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 4]], - ]) - test_tensor(x, res) - - i = self.index_tensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - v = self.value_empty(4, 0) - x = self.sparse_tensor(i, v, torch.Size([3, 4, 5, 0])) - res = self.value_empty(3, 4, 5, 0) - test_tensor(x, res) + # we don't have to_dense for half types on CPU because it is implemented + # with a slower add_ operation + for dtype in [torch.float16, torch.float64] if self.device != 'cpu' else [torch.float64]: + v = self.value_tensor([2, 1, 3, 4]).to(dtype=dtype) + x = self.sparse_tensor(i, v, torch.Size([3, 4, 5])) + res = self.value_tensor([ + [[2, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[1, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[0, 3, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 4]], + ]).to(dtype=dtype) + + test_tensor(x, res) + + i = self.index_tensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + v = self.value_empty(4, 0).to(dtype=dtype) + x = self.sparse_tensor(i, v, torch.Size([3, 4, 5, 0])) + res = self.value_empty(3, 4, 5, 0).to(dtype=dtype) + test_tensor(x, res) + + # half tensors on cpu don't implement to_dense, so need to convert to float + def _to_dense_half_safe(self, tensor): + if(tensor.dtype == torch.half and tensor.device.type == 'cpu'): + return tensor.to(torch.float).to_dense().to(torch.half) + else: + return tensor.to_dense() def test_to_sparse(self): shape = [10, 5, 19, 8] @@ -269,12 +280,15 @@ def test_to_sparse(self): max_nnz *= dim_sz rnnz = torch.randint(2, max_nnz, (1,)).item() for nnz in [0, 1, rnnz]: - expected, _, _ = self._gen_sparse(dim, nnz, shape) - d = expected.to_dense() - result = d.to_sparse(dim) - self.assertEqual(d, result.to_dense()) # == not implemented for sparse tensors yet - self.assertEqual(expected.size(), result.size()) - self.assertEqual(dim, result.sparse_dim()) + for dtype in [torch.float16, torch.float64, torch.int]: + expected, _, _ = self._gen_sparse(dim, nnz, shape) + expected = expected.to(dtype) + + d = 
self._to_dense_half_safe(expected) + result = d.to_sparse(dim) + self.assertEqual(d, self._to_dense_half_safe(result)) # == not implemented for sparse tensors yet + self.assertEqual(expected.size(), result.size()) + self.assertEqual(dim, result.sparse_dim()) sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3]) self.assertRaises(RuntimeError, lambda: sp.to_sparse()) @@ -563,6 +577,12 @@ def test_Sparse_to_Sparse_copy_(self): # test type conversion (when x1.copy_(x2), x1.dtype should stay the same) x1 = x1.to(torch.float32) + + x2 = x2.to(torch.float16) + x1_dtype = x1.dtype + x1.copy_(x2) + self.assertEqual(x1_dtype, x1.dtype) + x2 = x2.to(torch.float64) x1_dtype = x1.dtype x1.copy_(x2) @@ -630,6 +650,12 @@ def test_tensor(x): x = torch.sparse.FloatTensor(2, 3, 4) test_tensor(x) + x = torch.sparse.HalfTensor(2, 3, 4) + test_tensor(x) + + x = torch.cuda.sparse.HalfTensor(2, 3, 4) + test_tensor(x) + x = torch.sparse.FloatTensor(2, 3, 4, 0) test_tensor(x) @@ -1512,33 +1538,33 @@ def test_factory(self): for use_tensor_idx in [True, False]: for use_tensor_val in [True, False]: for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else \ - torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - if test_empty_tensor: - values = self.value_empty(1, 0) - else: - if use_tensor_val: - values = torch.tensor([1.], dtype=dtype) + for dtype in [torch.float64, torch.float16]: + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else \ + torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + if test_empty_tensor: + values = self.value_empty(1, 0).to(dtype) else: - values = 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + if use_tensor_val: + values = torch.tensor([1.], dtype=dtype) + else: + values = 1. 
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) def test_factory_size_check(self): indices = self.index_tensor([[1, 2], @@ -1653,6 +1679,8 @@ def test_factory_dense_dim(self): @cpu_only def test_factory_type_inference(self): + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float16)) + self.assertEqual(torch.float16, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) self.assertEqual(torch.float32, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float64)) @@ -1660,6 +1688,8 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.HalfTensor(1, 0)) + self.assertEqual(torch.float16, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) self.assertEqual(torch.float32, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) @@ -1713,6 +1743,10 @@ def test_tensor(indices, values, indices_equal, values_equal): values = torch.tensor([1.], dtype=torch.float32) test_tensor(indices, values, True, False) + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.tensor([1.], dtype=torch.float16) + test_tensor(indices, values, True, False) + indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.FloatTensor(1, 0) test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 @@ -1766,14 +1800,14 @@ def test_constructor_device_legacy(self): @cpu_only # not really, but we only really want to run this once def test_dtypes(self): - all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.float16] + all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes()] do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.is_available(): do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0')) @cpu_only # not really, but we only really want to run this once def test_empty_full(self): - all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.float16] + all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes()] do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.device_count() > 0: do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None) @@ -1923,6 +1957,47 @@ def do_test(t): do_test(self.sparse_empty(3, 0).data) do_test(self.sparse_empty(3, 0).detach()) + def test_change_tensor_metadata(self): + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.resize_(2, 3) + v.resize_(4, 5) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + 
self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.resize_as_(self.index_tensor([0, 1])) + v.resize_as_(self.value_tensor([3, 4, 5])) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.as_strided_((2, 1), (1, 1)) + v.as_strided_((1, 3), (1, 1)) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.set_(self.index_tensor([0, 1])) + v.set_(self.value_tensor([3, 4, 5])) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.transpose_(0, 1) + v.transpose_(0, 1) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + class TestUncoalescedSparse(TestSparse): def setUp(self): diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 078c2822cf6d..6990e5787b74 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -22,6 +22,13 @@ HAS_TORCHVISION = False skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision") +TEST_CAFFE2 = True +try: + from caffe2.python import workspace +except ImportError: + TEST_CAFFE2 = False +skipIfNoCaffe2 = unittest.skipIf(not TEST_CAFFE2, "no caffe2") + TEST_MATPLOTLIB = True try: import matplotlib @@ -72,6 +79,10 @@ def test_pytorch_np(self): self.assertIsInstance(make_np(0), np.ndarray) self.assertIsInstance(make_np(0.1), np.ndarray) + def test_pytorch_autograd_np(self): + x = torch.autograd.Variable(torch.Tensor(1)) + self.assertIsInstance(make_np(x), np.ndarray) + def test_pytorch_write(self): with SummaryWriter() as w: w.add_scalar('scalar', torch.autograd.Variable(torch.rand(1)), 0) @@ -94,7 +105,7 @@ def test_pytorch_histogram_raw(self): num=num, sum=floats.sum().item(), sum_squares=sum_sq, - bucket_limits=limits.tolist(), + bucket_limits=limits[1:].tolist(), bucket_counts=counts.tolist()) ints = make_np(torch.randint(0, 100, (num,))) @@ -107,7 +118,7 @@ def test_pytorch_histogram_raw(self): num=num, sum=ints.sum().item(), sum_squares=sum_sq, - bucket_limits=limits.tolist(), + bucket_limits=limits[1:].tolist(), bucket_counts=counts.tolist()) ints = torch.tensor(range(0, 100)).float() @@ -137,13 +148,31 @@ def test_to_HWC(self): self.assertEqual(converted.shape, (32, 32, 3)) def test_prepare_video(self): - # at each timestep the sum over all other dimensions of the video should stay the same - V_before = np.random.random((4, 10, 3, 20, 20)) - V_after = _prepare_video(np.copy(V_before)) - V_before = np.swapaxes(V_before, 0, 1) - V_before = np.reshape(V_before, newshape=(10, -1)) - V_after = np.reshape(V_after, newshape=(10, -1)) - np.testing.assert_array_almost_equal(np.sum(V_before, axis=1), np.sum(V_after, axis=1)) + # At each timeframe, the sum over all other + # dimensions of the video should be the same. 
+ shapes = [(16, 30, 3, 28, 28), + (36, 30, 3, 28, 28), + (19, 29, 3, 23, 19), + (3, 3, 3, 3, 3)] + for s in shapes: + V_input = np.random.random(s) + V_after = _prepare_video(np.copy(V_input)) + total_frame = s[1] + V_input = np.swapaxes(V_input, 0, 1) + for f in range(total_frame): + x = np.reshape(V_input[f], newshape=(-1)) + y = np.reshape(V_after[f], newshape=(-1)) + np.testing.assert_array_almost_equal(np.sum(x), np.sum(y)) + + def test_numpy_vid_uint8(self): + V_input = np.random.randint(0, 256, (16, 30, 3, 28, 28)).astype(np.uint8) + V_after = _prepare_video(np.copy(V_input)) * 255 + total_frame = V_input.shape[1] + V_input = np.swapaxes(V_input, 0, 1) + for f in range(total_frame): + x = np.reshape(V_input[f], newshape=(-1)) + y = np.reshape(V_after[f], newshape=(-1)) + np.testing.assert_array_almost_equal(np.sum(x), np.sum(y)) freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440] @@ -500,15 +529,19 @@ def test_scalar(self): res = make_np(np.int64(100000000000)) self.assertIsInstance(res, np.ndarray) and self.assertEqual(res.shape, (1,)) - def test_numpy_vid(self): - shapes = [(16, 3, 30, 28, 28), (19, 3, 30, 28, 28), (19, 3, 29, 23, 19)] - for s in shapes: - x = np.random.random_sample(s) - # assert make_np(x, 'VID').shape[3] == 3 + @skipIfNoCaffe2 + def test_caffe2_np(self): + workspace.FeedBlob("testBlob", np.random.randn(1, 3, 64, 64).astype(np.float32)) + self.assertIsInstance(make_np('testBlob'), np.ndarray) - def test_numpy_vid_uint8(self): - x = np.random.randint(0, 256, (16, 3, 30, 28, 28)).astype(np.uint8) - # make_np(x, 'VID').shape[3] == 3 + @skipIfNoCaffe2 + def test_caffe2_np_expect_fail(self): + with self.assertRaises(RuntimeError): + res = make_np('This_blob_does_not_exist') + + def test_pytorch_np_expect_fail(self): + with self.assertRaises(NotImplementedError): + res = make_np({'pytorch': 1.0}) if __name__ == '__main__': run_tests() diff --git a/test/test_torch.py b/test/test_torch.py index 6eb773c54d13..bf814b561b04 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1447,6 +1447,18 @@ def test_mv(self): self.assertEqual(res1, res2) + def test_numpy_args(self): + x1 = torch.randn(10) + x2 = torch.randn(10) + res1 = torch.add(input=x1, other=x2) + res2 = torch.add(x1=x1, x2=x2) + self.assertEqual(res1, res2) + + x1 = torch.randn(10, 10, 10) + res1 = x1.sum(dim=(0, 2), keepdim=True) + res2 = x1.sum(axis=(0, 2), keepdims=True) + self.assertEqual(res1, res2) + def test_add(self): # [res] torch.add([res,] tensor1, tensor2) m1 = torch.randn(100, 100) @@ -1783,8 +1795,7 @@ def run_test(matrix_size, batches, cast): # Info should be positive for rank deficient matrices a = cast(torch.ones(5, 3, 3)) - if not (a.is_cuda and any(x in torch.version.cuda for x in ['8.0', '9.2'])): - self.assertGreater(a.lu(get_infos=True)[2][0], 0) + self.assertGreater(a.lu(get_infos=True)[2][0], 0) # Error checking, no pivoting variant on CPU with self.assertRaisesRegex(RuntimeError, @@ -2422,6 +2433,50 @@ def test_zeros(self): expected = torch.tensor([[0.]], dtype=torch.float16) self.assertEqual(halfTensor, expected) + def test_std_mean(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 50, 20, device=device) + for dim in range(x.dim()): + for unbiased in [False, True]: + for keepdim in [False, True]: + std1, mean1 = torch.std_mean(x, dim=dim, unbiased=unbiased, keepdim=keepdim) + std2 = x.std(dim=dim, unbiased=unbiased, keepdim=keepdim) + mean2 = x.mean(dim=dim, keepdim=keepdim) + self.assertEqual(std1, std2) + self.assertEqual(mean1, 
mean2) + + def test_std_mean_all_dims(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 50, 20, device=device) + for unbiased in [False, True]: + std1, mean1 = torch.std_mean(x, unbiased=unbiased) + std2 = x.std(unbiased=unbiased) + mean2 = x.mean() + self.assertEqual(std1, std2) + self.assertEqual(mean1, mean2) + + def test_var_mean(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 300, 50, device=device) + for dim in range(x.dim()): + for unbiased in [False, True]: + for keepdim in [False, True]: + var1, mean1 = torch.var_mean(x, dim=dim, unbiased=unbiased, keepdim=keepdim) + var2 = x.var(dim=dim, unbiased=unbiased, keepdim=keepdim) + mean2 = x.mean(dim=dim, keepdim=keepdim) + self.assertEqual(var1, var2) + self.assertEqual(mean1, mean2) + + def test_var_mean_all_dims(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 50, 20, device=device) + for unbiased in [False, True]: + var1, mean1 = torch.var_mean(x, unbiased=unbiased) + var2 = x.var(unbiased=unbiased) + mean2 = x.mean() + self.assertEqual(var1, var2) + self.assertEqual(mean1, mean2) + def test_zeros_like(self): expected = torch.zeros(100, 100) @@ -2458,58 +2513,59 @@ def test_zeros_out(self): self.assertEqual(torch.zeros(shape), torch.zeros(shape, layout=torch.strided, out=out)) self.assertEqual(torch.zeros(shape), torch.zeros(shape, device='cpu', out=out)) - @staticmethod - def _test_histc(self, device): - # negative nbins throws - with self.assertRaisesRegex(RuntimeError, 'bins must be > 0'): - torch.histc(torch.tensor([1], dtype=torch.float, device=device), bins=-1) - - # without nbins - actual = torch.histc( - torch.tensor([2, 5], dtype=torch.float, device=device)) - expected = torch.zeros(100, dtype=torch.float, device=device) - expected.data[0] = 1 - expected.data[99] = 1 - self.assertEqual(expected, actual) - # tensor with the same element - actual = torch.histc(torch.ones(5, dtype=torch.float, device=device), bins=5) - self.assertEqual( - torch.tensor([0, 0, 5, 0, 0], dtype=torch.float, device=device), - actual) - # no element falls between [min, max] - actual = torch.histc( - torch.ones(5, dtype=torch.float, device=device), bins=5, min=2, max=3) - self.assertEqual( - torch.tensor([0, 0, 0, 0, 0], dtype=torch.float, device=device), - actual) - # element falls below min + integral bin size and - actual = torch.histc( - torch.tensor([2, 4, 2, 2, 5, 4], dtype=torch.float, device=device), - bins=5, min=1, max=5) - self.assertEqual( - torch.tensor([0, 3, 0, 2, 1], dtype=torch.float, device=device), - actual) - # non-integral bin size - actual = torch.histc( - torch.tensor([1, 2, 1], dtype=torch.float, device=device), - bins=4, min=0, max=3) - self.assertEqual( - torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), - actual) - # double input - actual = torch.histc( - torch.tensor([1, 2, 1], dtype=torch.double, device=device), - bins=4, min=0, max=3) - self.assertEqual( - torch.tensor([0, 2, 1, 0], dtype=torch.double, device=device), - actual) - # mixed input - actual = torch.histc( - torch.tensor([1., 2, 1], dtype=torch.float, device=device), - bins=4, min=0, max=3) - self.assertEqual( - torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), - actual) + def test_histc(self): + for device in torch.testing.get_all_device_types(): + # negative nbins throws + with self.assertRaisesRegex(RuntimeError, 'bins must be > 0'): + torch.histc(torch.tensor([1], dtype=torch.float, device=device), bins=-1) + + # without nbins + 
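# Aside: minimal usage sketch of the fused reductions exercised by the tests above;
# torch.std_mean / torch.var_mean return a (std, mean) / (var, mean) pair that should
# agree with separate std()/var() and mean() calls (shapes here are arbitrary):
import torch

x = torch.rand(4, 5)
std, mean = torch.std_mean(x, dim=1, unbiased=True, keepdim=False)
assert torch.allclose(std, x.std(dim=1, unbiased=True))
assert torch.allclose(mean, x.mean(dim=1))

var, mean = torch.var_mean(x, unbiased=False)
assert torch.allclose(var, x.var(unbiased=False))
assert torch.allclose(mean, x.mean())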
actual = torch.histc( + torch.tensor([2, 5], dtype=torch.float, device=device)) + expected = torch.zeros(100, dtype=torch.float, device=device) + expected.data[0] = 1 + expected.data[99] = 1 + self.assertEqual(expected, actual) + # tensor with the same element + actual = torch.histc(torch.ones(5, dtype=torch.float, device=device), bins=5) + self.assertEqual( + torch.tensor([0, 0, 5, 0, 0], dtype=torch.float, device=device), + actual) + # no element falls between [min, max] + actual = torch.histc( + torch.ones(5, dtype=torch.float, device=device), bins=5, min=2, max=3) + self.assertEqual( + torch.tensor([0, 0, 0, 0, 0], dtype=torch.float, device=device), + actual) + # element falls below min + integral bin size and + actual = torch.histc( + torch.tensor([2, 4, 2, 2, 5, 4], dtype=torch.float, device=device), + bins=5, min=1, max=5) + self.assertEqual( + torch.tensor([0, 3, 0, 2, 1], dtype=torch.float, device=device), + actual) + # non-integral bin size + actual = torch.histc( + torch.tensor([1, 2, 1], dtype=torch.float, device=device), + bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), + actual) + # double input + actual = torch.histc( + torch.tensor([1, 2, 1], dtype=torch.double, device=device), bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.double, device=device), + actual) + self.assertEqual(actual.dtype, torch.double) + # mixed input + actual = torch.histc( + torch.tensor([1., 2, 1], dtype=torch.float, device=device), + bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), + actual) + self.assertEqual(actual.dtype, torch.float) # test against numpy.histogram() def test_against_np(tensor, bins=100, min=0, max=0): @@ -2540,9 +2596,6 @@ def test_against_np(tensor, bins=100, min=0, max=0): expanded = torch.randn(1, 5, 1, 2, device=device).expand(3, 5, 7, 2) test_against_np(expanded) - def test_histc_cpu(self): - self._test_histc(self, 'cpu') - def test_ones(self): res1 = torch.ones(100, 100) res2 = torch.Tensor() @@ -2591,6 +2644,18 @@ def test_copy_dtypes(self): copied_dtype = copy.deepcopy(dtype) self.assertIs(dtype, copied_dtype) + def test_copy_transpose(self): + x = torch.arange(100 * 100, dtype=torch.float).reshape(100, 100).t() + y = torch.empty(100, 100, dtype=torch.float) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + + y = torch.empty(100, 100, dtype=torch.double) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + def test_device(self): cpu = torch.device('cpu') self.assertEqual('cpu', str(cpu)) @@ -2680,7 +2745,7 @@ def test_qtensor(self): r = torch.ones(num_elements, dtype=torch.float) scale = 1.0 zero_point = 2 - qr = r.quantize_linear(scale, zero_point) + qr = r.quantize_linear(scale, zero_point, torch.quint8) self.assertEqual(qr.q_scale(), scale) self.assertEqual(qr.q_zero_point(), zero_point) self.assertTrue(qr.is_quantized) @@ -2698,7 +2763,7 @@ def test_qtensor(self): # Scalar Tensor # item r = torch.ones(1, dtype=torch.float) - qr = r.quantize_linear(scale, zero_point) + qr = r.quantize_linear(scale, zero_point, torch.quint8) self.assertEqual(qr.item(), 1) self.assertEqual(qr[0].item(), 1) # assignment @@ -2711,12 +2776,12 @@ def test_qtensor(self): self.assertEqual(qr.item(), 15) # we can also print a qtensor self.assertEqual(str(qr), - "tensor([15.], size=(1,), dtype=torch.qint8, " + + "tensor([15.], size=(1,), 
dtype=torch.quint8, " + "scale=1.0, zero_point=2)") empty_r = torch.ones((0, 1), dtype=torch.float) - empty_qr = empty_r.quantize_linear(scale, zero_point) + empty_qr = empty_r.quantize_linear(scale, zero_point, torch.quint8) self.assertEqual(str(empty_qr), - "tensor([], size=(0, 1), dtype=torch.qint8, " + + "tensor([], size=(0, 1), dtype=torch.quint8, " + "scale=1.0, zero_point=2)") def test_qtensor_quant_dequant(self): @@ -2724,7 +2789,7 @@ def test_qtensor_quant_dequant(self): r = torch.from_numpy(r).float() scale = 2 zero_point = 2 - qr = r.quantize_linear(scale, zero_point) + qr = r.quantize_linear(scale, zero_point, torch.quint8) rqr = qr.dequantize() self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) @@ -2733,8 +2798,38 @@ def test_qtensor_creation(self): zero_point = 10 val = 100 numel = 10 - q = torch._empty_affine_quantized(numel, dtype=torch.qint8, scale=scale, zero_point=zero_point) - # TODO: check dequantized values? + q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point, dtype=torch.quint8) + self.assertEqual(scale, q.q_scale()) + self.assertEqual(zero_point, q.q_zero_point()) + + # create Tensor from uint8_t Tensor, scale and zero_point + int_tensor = torch.randint(0, 100, size=(10,), dtype=torch.uint8) + q = torch._per_tensor_affine_qtensor(int_tensor, scale, zero_point) + self.assertEqual(int_tensor, q.int_repr()) + self.assertEqual(scale, q.q_scale()) + self.assertEqual(zero_point, q.q_zero_point()) + + def test_qtensor_dtypes(self): + r = np.random.rand(3, 2) * 2 - 4 + r = torch.from_numpy(r).float() + scale = 2 + zero_point = 2 + qr = r.quantize_linear(scale, zero_point, torch.qint8) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + qr = r.quantize_linear(scale, zero_point, torch.quint8) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + qr = r.quantize_linear(scale, zero_point, torch.qint32) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + + def test_qtensor_dequantize_linear(self): + t = torch.arange(-10, 10, dtype=torch.int8) + scale = 3 + zero_point = 2 + qt = torch.dequantize_linear(t, scale, zero_point, torch.float) + @unittest.skipIf(torch.cuda.device_count() < 2, 'fewer than 2 GPUs detected') def test_device_guard(self): @@ -5872,18 +5967,10 @@ def test_single_det(M, target, desc): eye = torch.eye(5, device=device) test_single_det(eye, (torch.ones((), device=device), torch.zeros((), device=device)), 'identity') - # TODO: Remove when MAGMA 2.5.0 is built for CUDA 8 and CUDA 9.2 - is_cuda_8_92 = False - if torch.cuda.is_available() and torch.version.cuda is not None: - is_cuda_8_92 = any(x in torch.version.cuda for x in ['8.0', '9.2']) - def test(M): assert M.size(0) >= 5, 'this helper fn assumes M to be at least 5x5' M = M.to(device) - if M.is_cuda and is_cuda_8_92: - return - ref_M_sdet, ref_M_logabsdet = reference_slogdet(M) test_single_det(M, (ref_M_sdet, ref_M_logabsdet), 'basic') @@ -8140,6 +8227,98 @@ def fn(torchfn, *args): A_LU, pivots = fn(torch.lu, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) + def check_single_matmul(self, x, y, shape): + a = np.array(x, copy=False) + b = np.array(y, copy=False) + expected = np.matmul(a, b) + self.assertTrue(expected.flags['C_CONTIGUOUS']) + + ans = torch.matmul(x, y) + self.assertTrue(ans.is_contiguous()) + self.assertTrue(np.array_equal(ans, expected)) + + out = torch.zeros(*shape, dtype=torch.int64) + 
ans = torch.matmul(x, y, out=out) + self.assertIs(ans, out) + self.assertTrue(ans.is_contiguous()) + self.assertTrue(np.array_equal(ans, expected)) + + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_matmul_small_brute_force_1d_Nd(self): + # Issue #20452: range(0, 10) does not work. + n = 1 + for m in range(1, 8): + for p in range(1, 8): + for o in range(1, 5): + # 1d, 3d, inner dimensions C + x = torch.arange(m) + y = torch.arange(o * m * p).reshape(o, m, p) + self.check_single_matmul(x, y, (o, n, p)) + + # 1d, 3d, inner dimensions Fortran + x = torch.arange(m) + y = torch.arange(o * p * m).reshape(o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (o, n, p)) + + # 1d, 3d, inner dimensions non-contiguous + x = torch.arange(2 * m)[::2] + y = torch.arange(o * m * 2 * p).reshape(o, m, 2 * p)[:, :, ::2] + self.check_single_matmul(x, y, (o, n, p)) + + for r in range(1, 5): + # 1d, 4d, inner dimensions C + x = torch.arange(m) + y = torch.arange(r * o * m * p).reshape(r, o, m, p) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 1d, 4d, inner dimensions Fortran + x = torch.arange(m) + y = torch.arange(r * o * p * m).reshape(r, o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 1d, 4d, inner dimensions non-contiguous + x = torch.arange(2 * m)[::2] + y = torch.arange(r * o * m * 2 * p).reshape(r, o, m, 2 * p)[:, :, :, ::2] + self.check_single_matmul(x, y, (r, o, n, p)) + + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_matmul_small_brute_force_2d_Nd(self): + # Issue #20452: range(0, 10) does not work. + for n in range(1, 5): + for m in range(1, 5): + for p in range(1, 5): + for o in range(1, 3): + # 2d, 3d, inner dimensions C + x = torch.arange(n * m).reshape(n, m) + y = torch.arange(o * m * p).reshape(o, m, p) + self.check_single_matmul(x, y, (o, n, p)) + + # 2d, 3d, inner dimensions Fortran + x = torch.arange(m * n).reshape(m, n).transpose(-1, -2) + y = torch.arange(o * p * m).reshape(o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (o, n, p)) + + # 2d, 3d, inner dimensions non-contiguous + x = torch.arange(n * 2 * m).reshape(n, 2 * m)[:, ::2] + y = torch.arange(o * m * 2 * p).reshape(o, m, 2 * p)[:, :, ::2] + self.check_single_matmul(x, y, (o, n, p)) + + for r in range(1, 2): + # 2d, 4d, inner dimensions C + x = torch.arange(n * m).reshape(n, m) + y = torch.arange(r * o * m * p).reshape(r, o, m, p) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 2d, 4d, inner dimensions Fortran + x = torch.arange(m * n).reshape(m, n).transpose(-1, -2) + y = torch.arange(r * o * p * m).reshape(r, o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 2d, 4d, inner dimensions non-contiguous + x = torch.arange(n * 2 * m).reshape(n, 2 * m)[:, ::2] + y = torch.arange(r * o * m * 2 * p).reshape(r, o, m, 2 * p)[:, :, :, ::2] + self.check_single_matmul(x, y, (r, o, n, p)) + @skipIfRocm def test_blas_alpha_beta_empty(self): for device in torch.testing.get_all_device_types(): @@ -9314,7 +9493,7 @@ def test_serialization_offset_filelike(self): i, j = 41, 43 with BytesIOContext() as f: pickle.dump(i, f) - torch.save(a, f) + torch.save(a, f) pickle.dump(j, f) torch.save(b, f) f.seek(0) @@ -11348,6 +11527,21 @@ def test_c10_layer_norm(self): weight), torch.tensor(bias), 1, epsilon, True) torch.testing.assert_allclose(expected_norm, actual_norm) + def test_memory_format(self): + x = torch.randn(10, 3, 32, 32) + nhwc = x.contiguous(memory_format=torch.channels_last) + self.assertFalse(nhwc.is_contiguous()) + 
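# Aside on what the channels_last checks in this test assert: a tensor converted with
# contiguous(memory_format=torch.channels_last) keeps its logical NCHW sizes but
# carries NHWC-style strides, so plain is_contiguous() is False while the
# channels_last query is True (illustrative shapes):
import torch

t = torch.randn(2, 3, 4, 4)
t_nhwc = t.contiguous(memory_format=torch.channels_last)
print(t_nhwc.size())    # torch.Size([2, 3, 4, 4])  -- logical shape unchanged
print(t_nhwc.stride())  # (48, 1, 12, 3)            -- channel stride is 1
print(t_nhwc.is_contiguous(memory_format=torch.channels_last))  # True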
self.assertTrue(nhwc.is_contiguous(memory_format=torch.channels_last)) + self.assertEqual(nhwc, x) + + + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_memory_format_permute_cuda(self): + x = torch.randn(10, 3, 32, 32) + nhwc = x.contiguous(memory_format=torch.channels_last).cuda() + y = nhwc.permute(0, 1, 3, 2).permute(0, 1, 3, 2) + self.assertFalse(y.is_contiguous(memory_format=torch.channels_last)) + def test_subclass_tensors(self): # raise an error when trying to subclass FloatTensor with self.assertRaisesRegex(TypeError, "type 'torch.FloatTensor' is not an acceptable base type"): @@ -11361,13 +11555,20 @@ def foo(self): f = Foo2() self.assertEqual(f.foo(), 5) + def test_ndim(self): + a = torch.randn(1, 2, 3) + self.assertEqual(3, a.ndim) + b = torch.randn(()) + self.assertEqual(0, b.ndim) + c = torch.randn(1, 0) + self.assertEqual(2, c.ndim) + # Functions to test negative dimension wrapping METHOD = 1 INPLACE_METHOD = 2 FUNCTIONAL = 4 DIM_ARG = None - def make_neg_dim_test(name, tensor_arg, arg_constr, types, extra_dim=0): def neg_dim_test(self): if isinstance(tensor_arg, list): diff --git a/third_party/fbgemm b/third_party/fbgemm index 6ec218e6ed5d..9ae8912fc9d0 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 6ec218e6ed5dcb9b5397a608a3b5b8027b236819 +Subproject commit 9ae8912fc9d09cd22f333c226188cc161d9509a6 diff --git a/third_party/onnx b/third_party/onnx index 5bde6371620b..cc2333a3f929 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 5bde6371620b76302864bce90f521d72eda95d0e +Subproject commit cc2333a3f929caca7223b98699237f19388dd585 diff --git a/third_party/sleef b/third_party/sleef index 191f655caa25..9b249c53a803 160000 --- a/third_party/sleef +++ b/third_party/sleef @@ -1 +1 @@ -Subproject commit 191f655caa25526ae226cf88dd2529265176014a +Subproject commit 9b249c53a80343cc1a394ca961d7d5696ea76409 diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index e765e5aaaf1f..121e069e0b1e 100644 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -93,7 +93,7 @@ for new_dir in args.extra_include_dir: abs_new_dir = os.path.join(proj_dir, new_dir) if os.path.exists(abs_new_dir): - new_dir = os.path.join(new_dir, '*') + new_dir = os.path.join(new_dir, '**/*') includes.append(new_dir) ignores = [ diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index aa51c670528a..fbdb4c6a3b33 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -498,13 +498,13 @@ other: grad.clone().masked_fill_(self > other, 0) - name: mean(Tensor self) - self: grad.expand(self.sizes()) / self.numel() + self: mean_backward(grad, self.sizes(), self.numel()) - name: mean(Tensor self, ScalarType dtype) self: grad.expand(self.sizes()).to(self.scalar_type()) / self.numel() - name: mean(Tensor self, IntArrayRef dim, bool keepdim) - self: sum_backward(grad, self.sizes(), dim, keepdim) / _safe_size(self.sizes(), dim) + self: mean_backward(grad, self.sizes(), dim, keepdim) - name: mean(Tensor self, IntArrayRef dim, ScalarType dtype) self: sum_backward(grad, self.sizes(), dim, false).to(self.scalar_type()) / _safe_size(self.sizes(), dim) @@ -774,10 +774,10 @@ self: unsqueeze_to(grad, dim, self.sizes()) - name: std(Tensor self, bool unbiased) - self: var_backward(grad / (result * 2), self, unbiased) + self: std_backward(result, grad, self, unbiased) - name: std(Tensor self, IntArrayRef dim, bool unbiased, bool keepdim) - self: 
var_backward(grad / (result * 2), self, dim, unbiased, keepdim) + self: std_backward(result, grad, self, dim, unbiased, keepdim) - name: sub(Tensor self, Tensor other, *, Scalar alpha) self: grad @@ -1486,3 +1486,15 @@ # PackedSequence helpers - name: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) input: _pack_padded_sequence_backward(grad, input.sizes(), result1, batch_first) + +- name: std_mean(Tensor self, IntArrayRef dim, bool unbiased, bool keepdim) + self: var_std_mean_backward(grads, self, result0, result1, dim, unbiased, keepdim, true) + +- name: var_mean(Tensor self, IntArrayRef dim, bool unbiased, bool keepdim) + self: var_std_mean_backward(grads, self, result0, result1, dim, unbiased, keepdim, false) + +- name: std_mean(Tensor self, bool unbiased) + self: var_std_mean_backward(grads, self, result0, result1, unbiased, true) + +- name: var_mean(Tensor self, bool unbiased) + self: var_std_mean_backward(grads, self, result0, result1, unbiased, false) diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 698899c51c75..48199d332e3b 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -676,6 +676,46 @@ Tensor var_backward(Tensor grad, const Tensor & self, IntArrayRef dim, bool unbi return (2.0 / (_safe_size(self.sizes(), dim) - unbiased)) * grad * (self - self.mean(dim, true)); } +Tensor std_backward(const Tensor & result, const Tensor & grad, const Tensor & self, bool unbiased) { + return var_backward(grad / (result * 2), self, unbiased); +} + +Tensor std_backward(const Tensor & result, Tensor grad, const Tensor & self, IntArrayRef dim, bool unbiased, bool keepdim) { + return var_backward(grad / (result * 2), self, dim, unbiased, keepdim); +} + +Tensor mean_backward(Tensor grad, const IntArrayRef sizes, IntArrayRef dim, bool keepdim) { + return sum_backward(grad, sizes, dim, keepdim) / _safe_size(sizes, dim); +} + +Tensor mean_backward(Tensor grad, const IntArrayRef sizes, int numel) { + return grad.expand(sizes) / numel; +} + +Tensor var_std_mean_backward(const variable_list& grads, const Tensor & self, const Tensor & r1, const Tensor & r2, IntArrayRef dim, bool unbiased, bool keepdim, bool is_std) { + Tensor grad; + if (grads[0].defined()) { + grad = is_std ? std_backward(r1, grads[0], self, dim, unbiased, keepdim) : var_backward(grads[0], self, dim, unbiased, keepdim); + } + if (grads[1].defined()) { + Tensor mean_grad = mean_backward(grads[1], self.sizes(), dim, keepdim); + grad = grads[0].defined() ? grad + mean_grad : mean_grad; + } + return grad; +} + +Tensor var_std_mean_backward(const variable_list& grads, const Tensor & self, const Tensor & r1, const Tensor & r2, bool unbiased, bool is_std) { + Tensor grad; + if (grads[0].defined()) { + grad = is_std ? std_backward(r1, grads[0], self, unbiased) : var_backward(grads[0], self, unbiased); + } + if (grads[1].defined()) { + Tensor mean_grad = mean_backward(grads[1], self.sizes(), self.numel()); + grad = grads[0].defined() ? grad + mean_grad : mean_grad; + } + return grad; +} + Tensor masked_scatter_backward(const Tensor & grad, const Tensor & mask, IntArrayRef sizes) { int64_t numel = 1; for (auto size : sizes) { @@ -1581,7 +1621,7 @@ std::tuple prelu_double_backward( // This makes no assumption on the signs of sigma. 
Tensor svd_backward(const std::vector &grads, const Tensor& self, bool some, bool compute_uv, const Tensor& raw_u, const Tensor& sigma, const Tensor& raw_v) { - AT_CHECK(compute_uv, + TORCH_CHECK(compute_uv, "svd_backward: Setting compute_uv to false in torch.svd doesn't compute singular matrices, ", "and hence we cannot compute backward. Please use torch.svd(compute_uv=True)"); @@ -1664,7 +1704,7 @@ Tensor svd_backward(const std::vector &grads, const T // http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf Tensor symeig_backward(const std::vector &grads, const Tensor& self, bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) { - AT_CHECK(eigenvectors, + TORCH_CHECK(eigenvectors, "symeig_backward: Setting eigenvectors to false in torch.symeig doesn't compute eigenvectors ", "and hence we cannot compute backward. Please use torch.symeig(eigenvectors=True)"); diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index b368f99c0839..d12cc2dea11a 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -21,6 +21,7 @@ using at::Context; using at::Device; using at::Generator; using at::IntArrayRef; +using at::MemoryFormat; using at::Scalar; using at::ScalarType; using at::SparseTensorRef; diff --git a/tools/autograd/templates/python_nn_functions.cpp b/tools/autograd/templates/python_nn_functions.cpp index 96de55065a71..628fd740117d 100644 --- a/tools/autograd/templates/python_nn_functions.cpp +++ b/tools/autograd/templates/python_nn_functions.cpp @@ -2,6 +2,7 @@ // ${generated_comment} + #include "torch/csrc/Device.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" @@ -15,6 +16,7 @@ using at::Tensor; using at::Scalar; +using at::MemoryFormat; using namespace torch::autograd::utils; namespace torch { namespace autograd { diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 6542dddc3748..3b62d90ee04c 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -121,7 +121,7 @@ static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* k .pinned_memory(r.toBool(5)); return wrap(dispatch_arange(end, options)); } else { - AT_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible"); + TORCH_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible"); check_out_type_matches(r.tensor(1), r.scalartype(2), r.isNone(2), r.layout(3), r.isNone(3), r.device(4), r.isNone(4)); return wrap(dispatch_arange(r.scalar(0), r.tensor(1)).set_requires_grad(r.toBool(6))); @@ -141,7 +141,7 @@ static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* k .pinned_memory(r.toBool(7)); return wrap(dispatch_arange(start, end, step, options)); } else { - AT_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible"); + TORCH_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible"); check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), r.layout(5), r.isNone(5), r.device(6), r.isNone(6)); return wrap(dispatch_arange(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(8))); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index eb694febbda0..4397697bf2cd 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ 
b/tools/autograd/templates/python_variable_methods.cpp @@ -143,17 +143,24 @@ static PyObject * THPVariable_dim(PyObject* self, PyObject* args) END_HANDLE_TH_ERRORS } -static Tensor dispatch_contiguous(const Tensor & self) { +static Tensor dispatch_contiguous(const Tensor & self, at::MemoryFormat memory_format) { AutoNoGIL no_gil; OptionalDeviceGuard device_guard(device_of(self)); - return self.contiguous(); + return self.contiguous(memory_format); } - static PyObject * THPVariable_contiguous(PyObject* self, PyObject* args) + +static PyObject * THPVariable_contiguous(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "contiguous(*, MemoryFormat memory_format=contiguous_format)", + }); + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); auto& self_ = reinterpret_cast(self)->cdata; + auto memory_format = r.toMemoryFormat(0); // avoids touching the GIL or current device if self is already contiguous - if (self_.is_contiguous()) { + if (self_.is_contiguous(memory_format)) { // NOTE: this logic is duplicated from VariableType.cpp. Since we need to // record this call to contiguous() in the trace regardless of whether // we actually call contiguous here, we need to record this information @@ -163,13 +170,14 @@ static Tensor dispatch_contiguous(const Tensor & self) { auto node = tracer_state->graph->create(jit::aten::contiguous, /*num_outputs=*/0); jit::tracer::recordSourceLocation(node); jit::tracer::addInputs(node, "self", self_); + jit::tracer::addInputs(node, "memory_format", memory_format); tracer_state->graph->insertNode(node); jit::tracer::addOutput(node, self_); } Py_INCREF(self); return self; } - return THPVariable_Wrap(dispatch_contiguous(self_)); + return THPVariable_Wrap(dispatch_contiguous(self_, memory_format)); END_HANDLE_TH_ERRORS } @@ -321,7 +329,7 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); auto device = r.isNone(0) ? 
at::Device(at::DeviceType::CUDA) : r.device(0); - AT_CHECK(device.is_cuda(), "Invalid device, must be cuda device"); + TORCH_CHECK(device.is_cuda(), "Invalid device, must be cuda device"); torch::utils::cuda_lazy_init(); return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false)); END_HANDLE_TH_ERRORS @@ -432,15 +440,21 @@ static PyObject * THPVariable_requires_grad_(PyObject* self, PyObject* args, PyO END_HANDLE_TH_ERRORS } -inline bool dispatch_is_contiguous(Tensor & self) { - return self.is_contiguous(); +inline bool dispatch_is_contiguous(Tensor & self, MemoryFormat memory_format) { + return self.is_contiguous(memory_format); } -static PyObject * THPVariable_is_contiguous(PyObject* self_, PyObject* args) +static PyObject * THPVariable_is_contiguous(PyObject* self_, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "is_contiguous(*, MemoryFormat memory_format=contiguous_format)", + }); + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + auto memory_format = r.toMemoryFormat(0); auto& self = reinterpret_cast(self_)->cdata; - return wrap(dispatch_is_contiguous(self)); + return wrap(dispatch_is_contiguous(self, memory_format)); END_HANDLE_TH_ERRORS } @@ -686,7 +700,7 @@ PyMethodDef variable_methods[] = { {"apply_", (PyCFunction)THPVariable_apply_, METH_O, NULL}, {"byte", (PyCFunction)THPVariable_byte, METH_NOARGS, NULL}, {"char", (PyCFunction)THPVariable_char, METH_NOARGS, NULL}, - {"contiguous", (PyCFunction)THPVariable_contiguous, METH_NOARGS, NULL}, + {"contiguous", (PyCFunction)THPVariable_contiguous, METH_VARARGS | METH_KEYWORDS, NULL}, {"copy_", (PyCFunction)THPVariable_copy_, METH_VARARGS | METH_KEYWORDS, NULL}, {"cpu", (PyCFunction)THPVariable_cpu, METH_NOARGS, NULL}, {"cuda", (PyCFunction)THPVariable_cuda, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -698,7 +712,7 @@ PyMethodDef variable_methods[] = { {"bool", (PyCFunction)THPVariable_bool, METH_NOARGS, NULL}, {"half", (PyCFunction)THPVariable_half, METH_NOARGS, NULL}, {"int", (PyCFunction)THPVariable_int, METH_NOARGS, NULL}, - {"is_contiguous", (PyCFunction)THPVariable_is_contiguous, METH_NOARGS, NULL}, + {"is_contiguous", (PyCFunction)THPVariable_is_contiguous, METH_VARARGS | METH_KEYWORDS, NULL}, {"item", (PyCFunction)THPVariable_item, METH_NOARGS, NULL}, {"long", (PyCFunction)THPVariable_long, METH_NOARGS, NULL}, {"map_", (PyCFunction)THPVariable_map_, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/tools/autograd/templates/variable_factories.h b/tools/autograd/templates/variable_factories.h index 29b1e559392f..7fbcea2ffd99 100644 --- a/tools/autograd/templates/variable_factories.h +++ b/tools/autograd/templates/variable_factories.h @@ -4,7 +4,6 @@ #include #include -#include #include #include diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py index 62ec73c0f5b9..4d21610bb41e 100644 --- a/tools/build_pytorch_libs.py +++ b/tools/build_pytorch_libs.py @@ -81,14 +81,17 @@ def cmake_defines(lst, **kwargs): def overlay_windows_vcvars(env): - from distutils._msvccompiler import _get_vc_env - vc_arch = 'x64' if IS_64BIT else 'x86' - vc_env = _get_vc_env(vc_arch) - for k, v in env.items(): - lk = k.lower() - if lk not in vc_env: - vc_env[lk] = v - return vc_env + if sys.version_info >= (3, 5): + from distutils._msvccompiler import _get_vc_env + vc_arch = 'x64' if IS_64BIT else 'x86' + vc_env = _get_vc_env(vc_arch) + for k, v in env.items(): + lk = k.lower() + if lk not in vc_env: + vc_env[lk] = v + return vc_env + else: + return env def 
mkdir_p(dir): @@ -111,10 +114,10 @@ def create_build_env(): my_env['CUDA_BIN_PATH'] = escape_path(CUDA_HOME) if IS_WINDOWS: - my_env = overlay_windows_vcvars(my_env) # When using Ninja under Windows, the gcc toolchain will be chosen as default. # But it should be set to MSVC as the user's first choice. if USE_NINJA: + my_env = overlay_windows_vcvars(my_env) cc = my_env.get('CC', 'cl') cxx = my_env.get('CXX', 'cl') my_env['CC'] = cc @@ -134,10 +137,10 @@ def run_cmake(version, if USE_NINJA: cmake_args.append('-GNinja') elif IS_WINDOWS: + cmake_args.append('-GVisual Studio 15 2017') if IS_64BIT: - cmake_args.append('-GVisual Studio 15 2017 Win64') - else: - cmake_args.append('-GVisual Studio 15 2017') + cmake_args.append('-Ax64') + cmake_args.append('-Thost=x64') try: import numpy as np NUMPY_INCLUDE_DIR = np.get_include() @@ -163,7 +166,7 @@ def run_cmake(version, BUILDING_WITH_TORCH_LIBS=os.getenv("BUILDING_WITH_TORCH_LIBS", "ON"), TORCH_BUILD_VERSION=version, CMAKE_BUILD_TYPE=build_type, - BUILD_TORCH=os.getenv("BUILD_TORCH", "ON"), + CMAKE_VERBOSE_MAKEFILE="ON", BUILD_PYTHON=build_python, BUILD_SHARED_LIBS=os.getenv("BUILD_SHARED_LIBS", "ON"), BUILD_BINARY=check_env_flag('BUILD_BINARY'), @@ -207,8 +210,12 @@ def run_cmake(version, USE_REDIS=os.getenv('USE_REDIS'), USE_GLOG=os.getenv('USE_GLOG'), USE_GFLAGS=os.getenv('USE_GFLAGS'), + USE_ASAN=check_env_flag('USE_ASAN'), WERROR=os.getenv('WERROR')) + if os.getenv('_GLIBCXX_USE_CXX11_ABI'): + cmake_defines(cmake_args, GLIBCXX_USE_CXX11_ABI=os.getenv('_GLIBCXX_USE_CXX11_ABI')) + if os.getenv('USE_OPENMP'): cmake_defines(cmake_args, USE_OPENMP=check_env_flag('USE_OPENMP')) diff --git a/tools/build_variables.py b/tools/build_variables.py index 9080b47a911a..81744686a3e4 100644 --- a/tools/build_variables.py +++ b/tools/build_variables.py @@ -3,7 +3,7 @@ # not currently relevant so they are combined into one list. 
from __future__ import absolute_import, division, print_function, unicode_literals load("@bazel_skylib//lib:new_sets.bzl", "sets") - +load("//caffe2/caffe2/fb:defs_gpu.bzl", "gpu_library_selector") GENERATED_CPP = [ "Functions.cpp", @@ -68,6 +68,7 @@ "torch/csrc/jit/register_c10_ops.cpp", "torch/csrc/jit/subgraph_matcher.cpp", "torch/csrc/jit/symbolic_script.cpp", + "torch/csrc/jit/profiling_graph_executor_impl.cpp", "torch/csrc/jit/profiling_record.cpp", "torch/csrc/jit/operator.cpp", "torch/csrc/jit/passes/alias_analysis.cpp", @@ -84,6 +85,7 @@ "torch/csrc/jit/passes/graph_fuser.cpp", "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp", "torch/csrc/jit/passes/inplace_check.cpp", + "torch/csrc/jit/passes/insert_guards.cpp", "torch/csrc/jit/passes/loop_unrolling.cpp", "torch/csrc/jit/passes/lower_grad_of.cpp", "torch/csrc/jit/passes/lower_tuples.cpp", @@ -143,7 +145,6 @@ def add_torch_libs(): r = {} - c2_gpu = (read_config("caffe2", "gpu", "1") == "1") libtorch_python_sources = [ ":generate-code=THNN.cpp", ":generate-code=python_functions.cpp", @@ -157,6 +158,7 @@ def add_torch_libs(): "torch/csrc/DynamicTypes.cpp", "torch/csrc/Generator.cpp", "torch/csrc/Layout.cpp", + "torch/csrc/MemoryFormat.cpp", "torch/csrc/Module.cpp", "torch/csrc/PtrWrapper.cpp", "torch/csrc/Size.cpp", @@ -200,6 +202,7 @@ def add_torch_libs(): "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/byte_order.cpp", "torch/csrc/distributed/Module.cpp", + "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/jit/init.cpp", @@ -231,6 +234,7 @@ def add_torch_libs(): "torch/csrc/utils/tensor_apply.cpp", "torch/csrc/utils/tensor_dtypes.cpp", "torch/csrc/utils/tensor_layouts.cpp", + "torch/csrc/utils/tensor_memoryformats.cpp", "torch/csrc/utils/tensor_list.cpp", "torch/csrc/utils/tensor_new.cpp", "torch/csrc/utils/tensor_numpy.cpp", @@ -252,25 +256,26 @@ def add_torch_libs(): "torch/csrc/distributed/c10d/ddp.cpp", ] + compiler_flags_cpu = [ + "-D_THP_CORE", + "-DUSE_C10D", + "-DUSE_DISTRIBUTED", + "-DUSE_NUMPY", + "-DUSE_SCALARS", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DPYTORCH_ONNX_CAFFE2_BUNDLE", + "-Wno-write-strings", + "-Wno-format", + "-Wno-strict-aliasing", + "-Wno-non-virtual-dtor", + "-Wno-shadow-compatible-local", + "-Wno-empty-body", + ] + compiler_flags_cuda = [ + "-DUSE_CUDNN", + "-DUSE_NCCL", + ] common_flags = { - "compiler_flags": [ - "-D_THP_CORE", - "-DUSE_C10D", - "-DUSE_DISTRIBUTED", - "-DUSE_NUMPY", - "-DUSE_SCALARS", - "-DNO_CUDNN_DESTROY_HANDLE", - "-DPYTORCH_ONNX_CAFFE2_BUNDLE", - "-Wno-write-strings", - "-Wno-format", - "-Wno-strict-aliasing", - "-Wno-non-virtual-dtor", - "-Wno-shadow-compatible-local", - "-Wno-empty-body", - ] + ([ - "-DUSE_CUDNN", - "-DUSE_NCCL", - ] if c2_gpu else []), "compiler_specific_flags": { "clang": [ "-Wno-absolute-value", @@ -307,6 +312,7 @@ def add_torch_libs(): ("nanopb", None, "protobuf-nanopb"), ("protobuf", None), ], + compiler_flags=compiler_flags_cpu, **common_flags ) @@ -336,29 +342,55 @@ def add_torch_libs(): ("cuda", None, "nvrtc-lazy"), ("cuda", None, "nvrtc-builtins-lazy"), ], + compiler_flags=compiler_flags_cpu + compiler_flags_cuda, **common_flags ) # TODO: split it into cpp and cuda parts similarly to libtorch - cpp_library( + gpu_library_selector( name="_C_impl", - srcs=libtorch_python_sources + ( - libtorch_python_cuda_sources if c2_gpu else [] - ), + deps_cpu=[":_C_impl_cpu"], + deps_cuda=[":_C_impl_cuda"], + merge_cpu_deps=False, + ) + + cpp_library( + 
name="_C_impl_cpu", + srcs=libtorch_python_sources, + link_whole=True, + deps=[ + ":libtorch", + ":thnn", + "//caffe2/torch/lib/THD:THD_cpu", + "//caffe2/torch/lib/c10d:c10d_cpu", + "//caffe2/torch/lib/libshm:libshm", + ], + external_deps=[ + ("numpy", None, "cpp"), + ("pybind11", None), + ("python", None), + ], + compiler_flags=compiler_flags_cpu, + **common_flags + ) + + cpp_library( + name="_C_impl_cuda", + srcs=libtorch_python_sources + libtorch_python_cuda_sources, link_whole=True, deps=[ + ":libtorch_cuda", ":thnn", "//caffe2/torch/lib/THD:THD", "//caffe2/torch/lib/c10d:c10d", "//caffe2/torch/lib/libshm:libshm", - ] + [ - ":libtorch_cuda" if c2_gpu else ":libtorch", ], external_deps=[ ("numpy", None, "cpp"), ("pybind11", None), ("python", None), ], + compiler_flags=compiler_flags_cpu + compiler_flags_cuda, **common_flags ) diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 483fa5f047ef..bde2d2035c2a 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -40,6 +40,7 @@ 'std::array': 'bool[4]', 'std::string': 'str', 'Scalar': 'Scalar', + 'MemoryFormat': 'MemoryFormat', 'Scalar?': 'Scalar?', 'Tensor': 'Tensor', 'Tensor?': 'Tensor?', @@ -96,6 +97,7 @@ def jit_type_of(arg): 'IntArrayRef': '{}.toIntList()->elements()', 'Layout': '{}.toLayout()', 'Layout?': '{}.toOptional()', + 'MemoryFormat': '{}.toMemoryFormat()', 'Scalar': '{}.toScalar()', 'Scalar?': '{}.toOptional()', 'ScalarType': '{}.toScalarType()', @@ -483,6 +485,7 @@ def format_arg(arg): .replace('true', 'True') \ .replace('false', 'False') \ .replace('Reduction::Mean', 'Mean') \ + .replace('MemoryFormat::Contiguous', 'contiguous_format') \ .replace('{}', 'None' if is_tensor_arg(arg) else '[]') \ .replace('{', '[') \ .replace('}', ']') diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index f3ddc100e346..fd4abf591783 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -335,6 +335,8 @@ def gen_pyi(declarations_path, out): 'as_tensor': ["def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..."], 'get_num_threads': ['def get_num_threads() -> _int: ...'], 'set_num_threads': ['def set_num_threads(num: _int) -> None: ...'], + 'get_num_interop_threads': ['def get_num_interop_threads() -> _int: ...'], + 'set_num_interop_threads': ['def set_num_interop_threads(num: _int) -> None: ...'], # These functions are explicitly disabled by # SKIP_PYTHON_BINDINGS because they are hand bound. # Correspondingly, we must hand-write their signatures. diff --git a/tools/run-clang-tidy-in-ci.sh b/tools/run-clang-tidy-in-ci.sh index 57ce28212305..39e72e1eb109 100755 --- a/tools/run-clang-tidy-in-ci.sh +++ b/tools/run-clang-tidy-in-ci.sh @@ -16,7 +16,7 @@ if [[ ! -d build ]]; then mkdir build pushd build # We really only need compile_commands.json, so no need to build! - time cmake -DBUILD_TORCH=ON .. + time cmake .. popd # Generate ATen files. diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 5654b639e45a..e7f72d6ad9de 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -1,8 +1,7 @@ -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if (NOT BUILD_TORCH) - return() - endif() -else() +# This file used to build libtorch.so. +# Now it only builds the Torch python bindings. 
+ +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) cmake_minimum_required(VERSION 3.5 FATAL_ERROR) project(torch CXX C) find_package(Caffe2 REQUIRED) @@ -10,498 +9,34 @@ else() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() -option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) +if (NOT BUILD_PYTHON) + return() +endif() set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(TORCH_ROOT "${TORCH_SRC_DIR}/..") -if(NOT TORCH_INSTALL_BIN_DIR) - set(TORCH_INSTALL_BIN_DIR bin) -endif() - -if(NOT TORCH_INSTALL_INCLUDE_DIR) - set(TORCH_INSTALL_INCLUDE_DIR include) -endif() - if(NOT TORCH_INSTALL_LIB_DIR) set(TORCH_INSTALL_LIB_DIR lib) endif() -set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - -# Generate files -set(TOOLS_PATH "${TORCH_ROOT}/tools") - -configure_file("${TORCH_ROOT}/aten/src/ATen/common_with_cwrap.py" - "${TOOLS_PATH}/shared/cwrap_common.py" - COPYONLY) - -configure_file("${TORCH_SRC_DIR}/_utils_internal.py" - "${TOOLS_PATH}/shared/_utils_internal.py" - COPYONLY) - -add_custom_command( - OUTPUT - "${TORCH_SRC_DIR}/csrc/nn/THNN.cpp" - "${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods_dispatch.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_dispatch.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions_dispatch.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h" - "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_1.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_2.cpp" - COMMAND - "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py - --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" - --nn-path "aten/src/" - DEPENDS - "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" - "${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h" - "${TOOLS_PATH}/autograd/templates/VariableType.h" - "${TOOLS_PATH}/autograd/templates/VariableType.cpp" - "${TOOLS_PATH}/autograd/templates/Functions.h" - "${TOOLS_PATH}/autograd/templates/Functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_functions.h" - "${TOOLS_PATH}/autograd/templates/python_functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_variable_methods.cpp" - "${TOOLS_PATH}/autograd/templates/python_variable_methods_dispatch.h" - "${TOOLS_PATH}/autograd/templates/python_torch_functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_torch_functions_dispatch.h" - "${TOOLS_PATH}/autograd/templates/python_nn_functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_nn_functions.h" - 
"${TOOLS_PATH}/autograd/templates/python_nn_functions_dispatch.h" - "${TOOLS_PATH}/autograd/templates/variable_factories.h" - "${TOOLS_PATH}/autograd/deprecated.yaml" - "${TOOLS_PATH}/autograd/derivatives.yaml" - "${TOOLS_PATH}/autograd/gen_autograd_functions.py" - "${TOOLS_PATH}/autograd/gen_autograd.py" - "${TOOLS_PATH}/autograd/gen_python_functions.py" - "${TOOLS_PATH}/autograd/gen_variable_factories.py" - "${TOOLS_PATH}/autograd/gen_variable_type.py" - "${TOOLS_PATH}/autograd/load_derivatives.py" - "${TOOLS_PATH}/autograd/nested_dict.py" - "${TOOLS_PATH}/autograd/utils.py" - "${TOOLS_PATH}/jit/gen_jit_dispatch.py" - "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" - WORKING_DIRECTORY "${TORCH_ROOT}") - -set(TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp - ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp - ${TORCH_SRC_DIR}/csrc/autograd/function.cpp - ${TORCH_SRC_DIR}/csrc/autograd/function_hook.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp - ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp - ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp - ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp - ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp - ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp - ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp - ${TORCH_SRC_DIR}/csrc/jit/attributes.cpp - ${TORCH_SRC_DIR}/csrc/jit/argument_spec.cpp - ${TORCH_SRC_DIR}/csrc/jit/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/pass_manager.cpp - ${TORCH_SRC_DIR}/csrc/jit/pickler.cpp - ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp - ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_1.cpp - ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_2.cpp - ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp - ${TORCH_SRC_DIR}/csrc/jit/import_source.cpp - ${TORCH_SRC_DIR}/csrc/jit/import.cpp - ${TORCH_SRC_DIR}/csrc/jit/import_export_helpers.cpp - ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp - ${TORCH_SRC_DIR}/csrc/jit/constants.cpp - ${TORCH_SRC_DIR}/csrc/jit/node_hashing.cpp - ${TORCH_SRC_DIR}/csrc/jit/ir.cpp - ${TORCH_SRC_DIR}/csrc/jit/irparser.cpp - ${TORCH_SRC_DIR}/csrc/jit/netdef_converter.cpp - ${TORCH_SRC_DIR}/csrc/jit/operator.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_c10_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/subgraph_matcher.cpp - ${TORCH_SRC_DIR}/csrc/jit/symbolic_script.cpp - ${TORCH_SRC_DIR}/csrc/jit/profiling_record.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/alias_analysis.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/constant_pooling.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inline_autodiff_subgraphs.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp - 
${TORCH_SRC_DIR}/csrc/jit/passes/decompose_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/erase_number_types.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inline_fork_wait.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/lower_grad_of.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/remove_inplace_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_autogradzero.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/subgraph_rewrite.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/python_print.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/utils/subgraph_utils.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/utils/check_alias_annotation.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/utils/memory_dag.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/quantization.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/interface.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_quantized_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/scope.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp - ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp - ${TORCH_SRC_DIR}/csrc/jit/testing/file_check.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/final_returns.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/schema_matching.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/script_type_parser.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/sugared_value.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/class_type.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/parser.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/edit_distance.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/logging.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/jit_exception.cpp - ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp - ${TORCH_SRC_DIR}/csrc/jit/hooks_for_testing.cpp - ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp - ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/compiler.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/executor.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/codegen.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/fallback.cpp - ${TORCH_ROOT}/test/cpp/jit/test.cpp - ) - -if (WIN32) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_win.cpp - ) -else () - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_unix.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp - ) - if (USE_CUDA AND NOT USE_ROCM) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/fused_kernel.cpp - ) - add_library(thnvrtc SHARED ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/thnvrtc.cpp) - target_link_libraries(thnvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB}) - target_include_directories(thnvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) - install(TARGETS thnvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endif() -endif () - -if (USE_CUDA) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/profiler_cuda.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp - ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp - ) -endif() - - -if (USE_ROCM) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp - 
) -endif() - - -if (NOT NO_API) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/datasets/mnist.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/distributed.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/batchnorm.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/conv.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/dropout.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/embedding.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/functional.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/linear.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/named_any.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/rnn.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adagrad.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adam.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/optimizer.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/serialize.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp - ) -endif() - - -if (TORCH_STATIC) - add_library(torch STATIC ${TORCH_SRCS}) - target_compile_definitions(torch PUBLIC TORCH_BUILD_STATIC_LIBS) -else() - add_library(torch SHARED ${TORCH_SRCS}) -endif() - -target_compile_definitions(torch PUBLIC _THP_CORE) - -# until they can be unified, keep these lists synced with setup.py -if(MSVC) - target_compile_options(torch PUBLIC - ${MSVC_RUNTIME_LIBRARY_OPTION} - /Z7 - /EHa - /DNOMINMAX - /wd4267 - /wd4251 - /wd4522 - /wd4522 - /wd4838 - /wd4305 - /wd4244 - /wd4190 - /wd4101 - /wd4996 - /wd4275 - /bigobj - ) -else() - target_compile_options(torch PUBLIC - -std=c++11 - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - # Clang has an unfixed bug leading to spurious missing braces - # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 - -Wno-missing-braces - ) - - if(NOT APPLE) - target_compile_options(torch PRIVATE - # Considered to be flaky. See the discussion at - # https://github.com/pytorch/pytorch/pull/9608 - -Wno-maybe-uninitialized) - endif() - -endif() - -if (MSVC) -elseif (WERROR) - target_compile_options(torch PRIVATE -Werror -Wno-strict-overflow) -endif() - -if (MSVC) - target_link_libraries(torch onnx onnx_library) -endif() - -target_link_libraries(torch caffe2_library) - -find_package(OpenMP QUIET) -if(USE_OPENMP AND OPENMP_FOUND) - message(STATUS "pytorch is compiling with OpenMP. \n" - "OpenMP CXX_FLAGS: ${OpenMP_CXX_FLAGS}. 
\n" - "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}.") - target_compile_options(torch INTERFACE ${OpenMP_CXX_FLAGS}) - target_link_libraries(torch ${OpenMP_CXX_LIBRARIES}) -endif() - -if (NOT NO_API) - target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) -endif() - -if(USE_CUDA) - if(MSVC) - if (NOT NVTOOLEXT_HOME) - set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") - endif() - if ($ENV{NVTOOLEXT_HOME}) - set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) - endif() - set(TORCH_CUDA_LIBRARIES - ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib - ${CUDA_LIBRARIES}) - target_include_directories(torch PUBLIC "${NVTOOLEXT_HOME}/include") - elseif(APPLE) - set(TORCH_CUDA_LIBRARIES - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib - ${CUDA_LIBRARIES}) - set_target_properties(torch PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") - else() - find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) - set(TORCH_CUDA_LIBRARIES - ${LIBNVTOOLSEXT} - ${CUDA_LIBRARIES}) - endif() - - target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) - target_compile_definitions(torch PRIVATE USE_CUDA) -endif() - -if(USE_ROCM) - target_link_libraries(torch caffe2_hip_library) - target_compile_definitions(torch PRIVATE - USE_ROCM - __HIP_PLATFORM_HCC__ - ) - target_include_directories(torch PRIVATE - /opt/rocm/include - /opt/rocm/hcc/include - /opt/rocm/rocblas/include - /opt/rocm/hipsparse/include - ) -endif() - - -set(TH_CPU_INCLUDE - # dense - ${TORCH_ROOT}/aten/src/TH - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/TH - ${TORCH_ROOT}/aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src - ${CMAKE_BINARY_DIR}/aten/src) -target_include_directories(torch PRIVATE ${TH_CPU_INCLUDE}) - -set(ATen_CPU_INCLUDE - ${TORCH_ROOT}/aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen - ${CMAKE_BINARY_DIR}/aten/src) -target_include_directories(torch PUBLIC ${ATen_CPU_INCLUDE}) - -target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc) - -target_include_directories(torch PUBLIC - ${TORCH_ROOT}/third_party/miniz-2.0.8) - -set_target_properties(torch PROPERTIES VERSION 1 SOVERSION 1) - -if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") - set_property(TARGET torch PROPERTY CXX_STANDARD 11) -endif() - -# Prevent the unused functions being optimized away -# Otherwise torch.dll will be linked without caffe2_gpu.dll if (MSVC) - set_target_properties(torch PROPERTIES LINK_FLAGS "/OPT:NOREF") -endif(MSVC) - -install(DIRECTORY "${TORCH_SRC_DIR}/csrc" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch - FILES_MATCHING PATTERN "*.h") -install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) - -install(TARGETS torch DESTINATION "${TORCH_INSTALL_LIB_DIR}") - -if (MSVC AND NOT TORCH_STATIC) - install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) -endif() - -if (BUILD_TEST AND NOT MSVC AND NOT USE_ROCM) - add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) -endif() - -if (BUILD_TEST AND NOT NO_API) - add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api) + set(LIBSHM_SUBDIR libshm_windows) +else() + set(LIBSHM_SUBDIR libshm) endif() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - message(STATUS "${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") - 
execute_process( - COMMAND - "${CMAKE_CXX_COMPILER}" - "${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp" - "-o" - "${CMAKE_BINARY_DIR}/abi-check" - RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) - if (ABI_CHECK_COMPILE_RESULT) - message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") - endif() - execute_process( - COMMAND "${CMAKE_BINARY_DIR}/abi-check" - RESULT_VARIABLE ABI_CHECK_RESULT - OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) - if (ABI_CHECK_RESULT) - message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") - endif() - message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") -endif() +set(LIBSHM_SRCDIR ${TORCH_SRC_DIR}/lib/${LIBSHM_SUBDIR}) +add_subdirectory(${LIBSHM_SRCDIR}) -# CMake config for external projects. -configure_file( - ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in - ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake - @ONLY) -configure_file( - ${TORCH_ROOT}/cmake/TorchConfig.cmake.in - ${PROJECT_BINARY_DIR}/TorchConfig.cmake - @ONLY) -install(FILES - ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake - ${PROJECT_BINARY_DIR}/TorchConfig.cmake - DESTINATION share/cmake/Torch) -if (USE_DISTRIBUTED) - add_subdirectory(${TORCH_SRC_DIR}/lib/THD) - if (NOT MSVC AND NOT APPLE) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d) - endif() -endif() +# Generate files +set(TOOLS_PATH "${TORCH_ROOT}/tools") -if (BUILD_PYTHON) - if (MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/libshm_windows) - else() - add_subdirectory(${TORCH_SRC_DIR}/lib/libshm) - endif() - set(TORCH_PYTHON_SRCS +set(TORCH_PYTHON_SRCS + ${GENERATED_THNN_CXX} + ${GENERATED_CXX_PYTHON} ${TORCH_SRC_DIR}/csrc/CudaIPCTypes.cpp ${TORCH_SRC_DIR}/csrc/DataLoader.cpp ${TORCH_SRC_DIR}/csrc/Device.cpp @@ -511,16 +46,13 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/TypeInfo.cpp ${TORCH_SRC_DIR}/csrc/Generator.cpp ${TORCH_SRC_DIR}/csrc/Layout.cpp + ${TORCH_SRC_DIR}/csrc/MemoryFormat.cpp ${TORCH_SRC_DIR}/csrc/Module.cpp ${TORCH_SRC_DIR}/csrc/PtrWrapper.cpp ${TORCH_SRC_DIR}/csrc/Size.cpp ${TORCH_SRC_DIR}/csrc/Storage.cpp ${TORCH_SRC_DIR}/csrc/api/src/python/init.cpp ${TORCH_SRC_DIR}/csrc/autograd/functions/init.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp ${TORCH_SRC_DIR}/csrc/autograd/init.cpp ${TORCH_SRC_DIR}/csrc/autograd/python_anomaly_mode.cpp ${TORCH_SRC_DIR}/csrc/autograd/python_cpp_function.cpp @@ -545,7 +77,6 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/jit/script/python_sugared_value.cpp ${TORCH_SRC_DIR}/csrc/jit/script/python_tree_views.cpp ${TORCH_SRC_DIR}/csrc/multiprocessing/init.cpp - ${TORCH_SRC_DIR}/csrc/nn/THNN.cpp ${TORCH_SRC_DIR}/csrc/onnx/init.cpp ${TORCH_SRC_DIR}/csrc/serialization.cpp ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp @@ -559,13 +90,14 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/utils/tensor_dtypes.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_layouts.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_list.cpp + ${TORCH_SRC_DIR}/csrc/utils/tensor_memoryformats.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_new.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_numpy.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_types.cpp ${TORCH_SRC_DIR}/csrc/utils/tuple_parser.cpp ) - set(TORCH_PYTHON_INCLUDE_DIRECTORIES +set(TORCH_PYTHON_INCLUDE_DIRECTORIES ${PYTHON_INCLUDE_DIR} ${TORCH_ROOT} @@ -588,40 +120,35 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/lib ) - if (MSVC) - list(APPEND 
TORCH_PYTHON_INCLUDE_DIRECTORIES - ${TORCH_SRC_DIR}/lib/libshm_windows) - else() - list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES - ${TORCH_SRC_DIR}/lib/libshm) - endif() - set(TORCH_PYTHON_LINK_LIBRARIES - torch +list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${LIBSHM_SRCDIR}) + +set(TORCH_PYTHON_LINK_LIBRARIES + caffe2_library shm) - set(TORCH_PYTHON_COMPILE_DEFINITIONS) +set(TORCH_PYTHON_COMPILE_DEFINITIONS) - set(TORCH_PYTHON_COMPILE_OPTIONS) +set(TORCH_PYTHON_COMPILE_OPTIONS) - set(TORCH_PYTHON_LINK_FLAGS "") +set(TORCH_PYTHON_LINK_FLAGS "") - if (MSVC) +if (MSVC) string(APPEND TORCH_PYTHON_LINK_FLAGS " /NODEFAULTLIB:LIBCMT.LIB") list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${PYTHON_LIBRARIES}) if (NOT ${CMAKE_BUILD_TYPE} MATCHES "Release") string(APPEND TORCH_PYTHON_LINK_FLAGS " /DEBUG:FULL") endif() - elseif (APPLE) +elseif (APPLE) string(APPEND TORCH_PYTHON_LINK_FLAGS " -undefined dynamic_lookup") - else() +else() list(APPEND TORCH_PYTHON_COMPILE_OPTIONS -fno-strict-aliasing -Wno-write-strings -Wno-strict-aliasing) - endif() +endif() - if (USE_CUDA) +if (USE_CUDA) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/Module.cpp ${TORCH_SRC_DIR}/csrc/cuda/Storage.cpp @@ -630,7 +157,7 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/cuda/utils.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_comm.cpp ${TORCH_SRC_DIR}/csrc/cuda/serialization.cpp - ${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp + ${GENERATED_THNN_CXX_CUDA} ) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDA) @@ -643,9 +170,11 @@ if (BUILD_PYTHON) find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${LIBNVTOOLSEXT}) endif() - endif() - if (USE_CUDNN) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES caffe2_gpu_library) +endif() + +if (USE_CUDNN) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN) # NOTE: these are at the front, in case there's another cuDNN in @@ -655,19 +184,19 @@ if (BUILD_PYTHON) # we're not careful. 
list(INSERT 0 TORCH_PYTHON_LINK_LIBRARIES ${CUDNN_LIBRARY}) list(INSERT 0 TORCH_PYTHON_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}) - endif() +endif() - if (USE_MIOPEN) +if (USE_MIOPEN) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_MIOPEN) list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${MIOPEN_INCLUDE_DIR}) list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MIOPEN_LIBRARY}) - endif() +endif() - if (USE_NUMPY) +if (USE_NUMPY) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NUMPY) - endif() +endif() - if (USE_ROCM) +if (USE_ROCM) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/Module.cpp ${TORCH_SRC_DIR}/csrc/cuda/Storage.cpp @@ -676,32 +205,35 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/cuda/utils.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_comm.cpp ${TORCH_SRC_DIR}/csrc/cuda/serialization.cpp - ${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp + ${GENERATED_THNN_CXX_CUDA} ) list(APPEND TORCH_PYTHON_LINK_LIBRARIES caffe2_hip_library) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_ROCM __HIP_PLATFORM_HCC__ ) - endif() +endif() - if (USE_DISTRIBUTED) +if (USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/Module.cpp) list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${TORCH_SRC_DIR}/lib/THD) list(APPEND TORCH_PYTHON_LINK_LIBRARIES THD) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) if (NOT MSVC AND NOT APPLE) - list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp) - list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp) + list(APPEND TORCH_PYTHON_SRCS + ${TORCH_SRC_DIR}/csrc/distributed/c10d/comm.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp + ) list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) if (USE_CUDA OR USE_ROCM) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp) endif() endif() - endif() +endif() - if (USE_NCCL) +if (USE_NCCL) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) @@ -709,12 +241,16 @@ if (BUILD_PYTHON) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) if (USE_SYSTEM_NCCL) endif() - endif() +endif() - add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi") - # For Declarations.yaml dependency - add_dependencies(torch_python_stubs ATEN_CPU_FILES_GEN_TARGET) - add_custom_command( + + + + +add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi") +# For Declarations.yaml dependency +add_dependencies(torch_python_stubs ATEN_CPU_FILES_GEN_TARGET) +add_custom_command( OUTPUT "${TORCH_SRC_DIR}/__init__.pyi" COMMAND @@ -727,21 +263,34 @@ if (BUILD_PYTHON) "${TORCH_ROOT}" ) - add_library(torch_python SHARED ${TORCH_PYTHON_SRCS}) - add_dependencies(torch_python torch_python_stubs) - target_link_libraries(torch_python ${TORCH_PYTHON_LINK_LIBRARIES}) +add_library(torch_python SHARED ${TORCH_PYTHON_SRCS}) - target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) +# Required workaround for generated sources +# See https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories +add_dependencies(torch_python generate-torch-sources) +set_source_files_properties( + ${GENERATED_THNN_SOURCES} + ${GENERATED_CXX_PYTHON} + PROPERTIES GENERATED TRUE + ) - target_compile_options(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS}) 
+target_compile_definitions(torch_python PUBLIC _THP_CORE) - target_include_directories(torch_python PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES}) +add_dependencies(torch_python torch_python_stubs) - if (NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") - set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) - endif() +target_link_libraries(torch_python ${TORCH_PYTHON_LINK_LIBRARIES}) - install(TARGETS torch_python DESTINATION "${TORCH_INSTALL_LIB_DIR}") +target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) + +target_compile_options(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS}) + +target_include_directories(torch_python PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES}) + + +if (NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") + set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) endif() + +install(TARGETS torch_python DESTINATION "${TORCH_INSTALL_LIB_DIR}") diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 8c2cff48e6c6..d7a779dd8265 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -179,7 +179,7 @@ def decorator(fn): try: import typing - from typing import Tuple, List, Dict + from typing import Tuple, List, Dict, Optional def is_tuple(ann): # For some reason Python 3.7 violates the Type[A, B].__origin__ == Type rule @@ -196,6 +196,22 @@ def is_dict(ann): return ann.__module__ == 'typing' and \ (getattr(ann, '__origin__', None) is typing.Dict or getattr(ann, '__origin__', None) is dict) + + def is_optional(ann): + # Optional[T] is just shorthand for Union[T, None], so check for both + union_optional = False + if ann.__module__ == 'typing' and \ + (getattr(ann, '__origin__', None) is typing.Union): + args = getattr(ann, '__args__', ()) + if len(args) == 2: + union_optional = (issubclass(args[1], type(None)) and not issubclass(args[0], type(None))) \ + or (issubclass(args[0], type(None)) and not issubclass(args[1], type(None))) + + optional = ann.__module__ == 'typing' and \ + (getattr(ann, '__origin__', None) is typing.Optional) + + return optional or union_optional + except ImportError: # A minimal polyfill for versions of Python that don't have typing. # Note that this means that they also don't support the fancy annotation syntax, so @@ -232,9 +248,20 @@ class DictCls(object): def __getitem__(self, types): return DictInstance(types) + class OptionalInstance(object): + __slots__ = ['__args__'] + + def __init__(self, types): + self.__args__ = types + + class OptionalCls(object): + def __getitem__(self, types): + return OptionalInstance(types) + Tuple = TupleCls() # noqa: T484 List = ListCls() # noqa: T484 Dict = DictCls() # noqa: T484 + Optional = DictCls() # noqa: T484 def is_tuple(ann): return isinstance(ann, TupleInstance) @@ -245,6 +272,9 @@ def is_list(ann): def is_dict(ann): return isinstance(ann, DictInstance) + def is_optional(ann): + return isinstance(ann, OptionalInstance) + # allows BroadcastingList instance to be subscriptable class BroadcastingListCls(object): diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index b904f09009fa..6e39e36dfcc9 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3031,6 +3031,15 @@ def callable(a, b) -> number See :func:`torch.det` """) +add_docstr_all('dequantize_linear', + r""" +dequantize_linear(int_tensor, scale, zero_point) -> Tensor + +Dequantize an int Tensor that represents the underlying quantized data +using affine quantization scheme with given scale and zero_point. +returns a float Tensor. 
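The `is_optional` helper added to `torch/_jit_internal.py` above relies on `Optional[T]` being plain shorthand for `Union[T, None]`. A simplified, standalone sketch of the same check against the real `typing` module (illustrative only, not the exact helper; `looks_optional` is a made-up name):

```python
import typing
from typing import Optional, Union

def looks_optional(ann):
    # Optional[T] is just Union[T, None], so an annotation counts as optional
    # when it is a typing.Union whose two arguments include NoneType.
    if getattr(ann, '__module__', None) != 'typing':
        return False
    if getattr(ann, '__origin__', None) is not typing.Union:
        return False
    args = getattr(ann, '__args__', ())
    return len(args) == 2 and type(None) in args

print(looks_optional(Optional[int]))     # True
print(looks_optional(Union[int, None]))  # True -- same alias as Optional[int]
print(looks_optional(Union[int, str]))   # False
```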
+""") + add_docstr_all('where', r""" where(condition, y) -> Tensor @@ -3191,3 +3200,8 @@ def callable(a, b) -> number r""" Is the :class:`torch.device` where this Tensor is. """) + +add_docstr_all('ndim', + r""" +Alias for :meth:`~Tensor.dim()` +""") diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 291a5871fbe0..8683f3434552 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2147,7 +2147,15 @@ def merge_dicts(*dicts): r""" get_num_threads() -> int -Gets the number of threads used for parallelizing CPU operations +Returns the number of threads used for parallelizing CPU operations +""") + +add_docstr(torch.get_num_interop_threads, + r""" +get_num_interop_threads() -> int + +Returns the number of threads used for inter-op parallelism on CPU +(e.g. in JIT interpreter) """) add_docstr(torch.gt, @@ -4304,6 +4312,16 @@ def merge_dicts(*dicts): must be called before running eager, JIT or autograd code. """) +add_docstr(torch.set_num_interop_threads, + r""" +set_num_interop_threads(int) + +Sets the number of threads used for interop parallelism +(e.g. in JIT interpreter) on CPU. +WARNING: Can only be called once and before any inter-op parallel work +is started (e.g. JIT execution). +""") + add_docstr(torch.sigmoid, r""" sigmoid(input, out=None) -> Tensor @@ -4660,6 +4678,56 @@ def merge_dicts(*dicts): tensor([ 1.0311, 0.7477, 1.2204, 0.9087]) """.format(**multi_dim_common)) +add_docstr(torch.std_mean, + r""" +.. function:: std_mean(input, unbiased=True) -> (Tensor, Tensor) + +Returns the standard-deviation and mean of all elements in the :attr:`input` tensor. + +If :attr:`unbiased` is ``False``, then the standard-deviation will be calculated +via the biased estimator. Otherwise, Bessel's correction will be used. + +Args: + input (Tensor): the input tensor + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[0.3364, 0.3591, 0.9462]]) + >>> torch.std_mean(a) + (tensor(0.3457), tensor(0.5472)) + +.. function:: std(input, dim, keepdim=False, unbiased=True) -> (Tensor, Tensor) + +Returns the standard-deviation and mean of each row of the :attr:`input` tensor in the +dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, +reduce over all of them. + +{keepdim_details} + +If :attr:`unbiased` is ``False``, then the standard-deviation will be calculated +via the biased estimator. Otherwise, Bessel's correction will be used. + +Args: + input (Tensor): the input tensor + {dim} + {keepdim} + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.5648, -0.5984, -1.2676, -1.4471], + [ 0.9267, 1.0612, 1.1050, -0.6014], + [ 0.0154, 1.9301, 0.0125, -1.0904], + [-1.9711, -0.7748, -1.3840, 0.5067]]) + >>> torch.std_mean(a, 1) + (tensor([0.9110, 0.8197, 1.2552, 1.0608]), tensor([-0.6871, 0.6229, 0.2169, -0.9058])) +""".format(**multi_dim_common)) + add_docstr(torch.sum, r""" .. function:: sum(input, dtype=None) -> Tensor @@ -5513,6 +5581,55 @@ def merge_dicts(*dicts): tensor([ 1.7444, 1.1363, 0.7356, 0.5112]) """.format(**multi_dim_common)) +add_docstr(torch.var_mean, + r""" +.. function:: var_mean(input, unbiased=True) -> (Tensor, Tensor) + +Returns the variance and mean of all elements in the :attr:`input` tensor. + +If :attr:`unbiased` is ``False``, then the variance will be calculated via the +biased estimator. Otherwise, Bessel's correction will be used. 
+ +Args: + input (Tensor): the input tensor + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[0.0146, 0.4258, 0.2211]]) + >>> torch.var_mean(a) + (tensor(0.0423), tensor(0.2205)) + +.. function:: var_mean(input, dim, keepdim=False, unbiased=True) -> (Tensor, Tensor) + +Returns the variance and mean of each row of the :attr:`input` tensor in the given +dimension :attr:`dim`. + +{keepdim_details} + +If :attr:`unbiased` is ``False``, then the variance will be calculated via the +biased estimator. Otherwise, Bessel's correction will be used. + +Args: + input (Tensor): the input tensor + {dim} + {keepdim} + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.5650, 2.0415, -0.1024, -0.5790], + [ 0.2325, -2.6145, -1.6428, -0.3537], + [-0.2159, -1.1069, 1.2882, -1.3265], + [-0.6706, -1.5893, 0.6827, 1.6727]]) + >>> torch.var_mean(a, 1) + (tensor([2.3174, 1.6403, 1.4092, 2.0791]), tensor([-0.0512, -1.0946, -0.3403, 0.0239])) +""".format(**multi_dim_common)) + add_docstr(torch.zeros, r""" zeros(*sizes, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp index 46af266f3fde..d17bbabc1246 100644 --- a/torch/csrc/Device.cpp +++ b/torch/csrc/Device.cpp @@ -67,7 +67,7 @@ PyObject *THPDevice_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) device_index = r.toInt64(1); // -1 is allowed in ATen/C++, to mean the default device, but not in // Python. - AT_CHECK(device_index >= 0, "Device index must not be negative"); + TORCH_CHECK(device_index >= 0, "Device index must not be negative"); } at::Device device(as_device.type(), device_index); return THPDevice_New(device); diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index c0578002d29f..97679f290f6d 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -25,18 +25,6 @@ namespace torch { namespace { -const std::unordered_map attype_names = { - {"Float", at::kFloat}, - {"Double", at::kDouble}, - {"Half", at::kHalf}, - {"Byte", at::kByte}, - {"Char", at::kChar}, - {"Short", at::kShort}, - {"Int", at::kInt}, - {"Long", at::kLong}, - {"Bool", at::kBool}, -}; - std::unordered_map attype_to_py_storage_type; std::unordered_map py_storage_type_to_attype; @@ -62,12 +50,11 @@ at::Backend get_backend(bool is_cuda, bool is_sparse) { } } -at::DeprecatedTypeProperties* get_type(const std::string& name, bool is_cuda, bool is_sparse) { - if (is_sparse && name == "Half") { +at::DeprecatedTypeProperties* get_type(at::Backend backend, at::ScalarType scalarType) { + if (isSparse(backend) && scalarType == at::kHalf) { return nullptr; } - at::Backend backend = get_backend(is_cuda, is_sparse); - return &at::getNonVariableDeprecatedTypeProperties(backend, attype_names.at(name)); + return &at::getNonVariableDeprecatedTypeProperties(backend, scalarType); } PyTypeObject* getPyTypeObject(const at::Storage& storage) @@ -85,9 +72,8 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) } } // namespace -void registerStoragePyTypeObject(PyTypeObject *pytype, const std::string& name, bool is_cuda, bool is_sparse) -{ - auto attype = get_type(name, is_cuda, is_sparse); +void registerStoragePyTypeObject(PyTypeObject *pytype, at::Backend backend, at::ScalarType scalarType) { + auto attype = get_type(backend, scalarType); if (attype) { attype_to_py_storage_type[attype] = pytype; 
py_storage_type_to_attype[pytype] = attype; diff --git a/torch/csrc/DynamicTypes.h b/torch/csrc/DynamicTypes.h index 1f3c4bc342bf..93ce9f2af914 100644 --- a/torch/csrc/DynamicTypes.h +++ b/torch/csrc/DynamicTypes.h @@ -25,8 +25,7 @@ struct Type; namespace torch { // Register a PyTypeObject* with the given attributes void registerStoragePyTypeObject( - PyTypeObject *pytype, const std::string& name, - bool is_cuda, bool is_sparse); + PyTypeObject *pytype, at::Backend backend, at::ScalarType scalarType); void registerDtypeObject(THPDtype *dtype, at::ScalarType scalarType); void registerLayoutObject(THPLayout *layout, at::Backend backend); diff --git a/torch/csrc/MemoryFormat.cpp b/torch/csrc/MemoryFormat.cpp new file mode 100644 index 000000000000..54ce6aad1240 --- /dev/null +++ b/torch/csrc/MemoryFormat.cpp @@ -0,0 +1,80 @@ +#include + +#include +#include +#include + +#include + +#include +#include +#include + +PyObject *THPMemoryFormat_New(at::MemoryFormat memory_format, const std::string& name) +{ + auto type = (PyTypeObject*)&THPMemoryFormatType; + auto self = THPObjectPtr{type->tp_alloc(type, 0)}; + if (!self) throw python_error(); + auto self_ = reinterpret_cast(self.get()); + self_->memory_format = memory_format; + std::strncpy (self_->name, name.c_str(), MEMORY_FORMAT_NAME_LEN); + self_->name[MEMORY_FORMAT_NAME_LEN] = '\0'; + return self.release(); +} + +PyObject *THPMemoryFormat_repr(THPMemoryFormat *self) +{ + return THPUtils_packString(self->name); +} + +PyTypeObject THPMemoryFormatType = { + PyVarObject_HEAD_INIT(nullptr, 0) + "torch.memory_format", /* tp_name */ + sizeof(THPMemoryFormat), /* tp_basicsize */ + 0, /* tp_itemsize */ + nullptr, /* tp_dealloc */ + nullptr, /* tp_print */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + (reprfunc)THPMemoryFormat_repr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + nullptr, /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + nullptr, /* tp_methods */ + nullptr, /* tp_members */ + nullptr, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + nullptr, /* tp_new */ +}; + +void THPMemoryFormat_init(PyObject *module) +{ + if (PyType_Ready(&THPMemoryFormatType) < 0) { + throw python_error(); + } + Py_INCREF(&THPMemoryFormatType); + if (PyModule_AddObject(module, "memory_format", (PyObject *)&THPMemoryFormatType) != 0) { + throw python_error(); + } +} diff --git a/torch/csrc/MemoryFormat.h b/torch/csrc/MemoryFormat.h new file mode 100644 index 000000000000..835b8e92b38e --- /dev/null +++ b/torch/csrc/MemoryFormat.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include + +#include + +const int MEMORY_FORMAT_NAME_LEN = 64; + +struct THPMemoryFormat { + PyObject_HEAD + at::MemoryFormat memory_format; + char name[MEMORY_FORMAT_NAME_LEN + 1]; +}; + +extern PyTypeObject THPMemoryFormatType; + +inline bool THPMemoryFormat_Check(PyObject *obj) { + return Py_TYPE(obj) == &THPMemoryFormatType; +} + +PyObject * THPMemoryFormat_New(at::MemoryFormat memory_format, const std::string& 
name); + +void THPMemoryFormat_init(PyObject *module); diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index b0658e824213..3675a146b189 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +99,7 @@ static PyObject * THPModule_initExtension(PyObject *_unused, PyObject *shm_manag return nullptr; } torch::utils::initializeLayouts(); + torch::utils::initializeMemoryFormats(); torch::utils::initializeDtypes(); torch::tensors::initialize_python_bindings(); std::string path = THPUtils_unpackString(shm_manager_path); @@ -155,7 +158,24 @@ static PyObject * THPModule_setNumThreads(PyObject *module, PyObject *arg) { THPUtils_assert(THPUtils_checkLong(arg), "set_num_threads expects an int, " "but got %s", THPUtils_typename(arg)); - at::set_num_threads((int)THPUtils_unpackLong(arg)); + int nthreads = (int)THPUtils_unpackLong(arg); + THPUtils_assert(nthreads > 0, "set_num_threads expects a positive integer"); + at::set_num_threads(nthreads); + Py_RETURN_NONE; +} + +static PyObject * THPModule_getNumInteropThreads(PyObject *module) +{ + return PyLong_FromLong(at::get_num_interop_threads()); +} + +static PyObject * THPModule_setNumInteropThreads(PyObject *module, PyObject *arg) +{ + THPUtils_assert(THPUtils_checkLong(arg), "set_num_interop_threads expects an int, " + "but got %s", THPUtils_typename(arg)); + int nthreads = (int)THPUtils_unpackLong(arg); + THPUtils_assert(nthreads > 0, "set_num_interop_threads expects a positive integer"); + at::set_num_interop_threads(nthreads); Py_RETURN_NONE; } @@ -455,6 +475,8 @@ static PyMethodDef TorchMethods[] = { {"_get_backcompat_keepdim_warn", (PyCFunction)THPModule_getBackcompatKeepdimWarn, METH_NOARGS, nullptr}, {"get_num_threads", (PyCFunction)THPModule_getNumThreads, METH_NOARGS, nullptr}, {"set_num_threads", (PyCFunction)THPModule_setNumThreads, METH_O, nullptr}, + {"get_num_interop_threads", (PyCFunction)THPModule_getNumInteropThreads, METH_NOARGS, nullptr}, + {"set_num_interop_threads", (PyCFunction)THPModule_setNumInteropThreads, METH_O, nullptr}, {"_get_cudnn_enabled", (PyCFunction)THPModule_userEnabledCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_enabled", (PyCFunction)THPModule_setUserEnabledCuDNN, METH_O, nullptr}, {"_get_cudnn_benchmark", (PyCFunction)THPModule_benchmarkCuDNN, METH_NOARGS, nullptr}, @@ -589,6 +611,7 @@ PyObject* initModule() { THPDtype_init(module); THPDTypeInfo_init(module); THPLayout_init(module); + THPMemoryFormat_init(module); THPDevice_init(module); ASSERT_TRUE(THPVariable_initModule(module)); ASSERT_TRUE(THPFunction_initModule(module)); diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp index dcccbb217d34..12b97fb8ec1b 100644 --- a/torch/csrc/TypeInfo.cpp +++ b/torch/csrc/TypeInfo.cpp @@ -55,7 +55,7 @@ PyObject* THPFInfo_pynew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { torch::ParsedArgs<1> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); - AT_CHECK(r.idx < 2, "Not a type"); + TORCH_CHECK(r.idx < 2, "Not a type"); at::ScalarType scalar_type; if (r.idx == 1) { scalar_type = torch::tensors::get_default_scalar_type(); @@ -81,7 +81,7 @@ PyObject* THPIInfo_pynew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { }); torch::ParsedArgs<1> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); - AT_CHECK(r.idx == 0, "Not a type"); + TORCH_CHECK(r.idx == 0, "Not a type"); at::ScalarType scalar_type = 
r.scalartype(0); if (!at::isIntegralType(scalar_type)) { diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 371355aa9943..7f8ef4e01677 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -1,21 +1,7 @@ #pragma once -#ifdef _WIN32 -#if !defined(TORCH_BUILD_STATIC_LIBS) -#if defined(torch_EXPORTS) -#define TORCH_API __declspec(dllexport) -#else -#define TORCH_API __declspec(dllimport) -#endif -#else -#define TORCH_API -#endif -#elif defined(__GNUC__) -#if defined(torch_EXPORTS) -#define TORCH_API __attribute__((__visibility__("default"))) -#else -#define TORCH_API -#endif -#else -#define TORCH_API -#endif +#include + +// There's no difference between aten, torch and caffe2 libs any more +// TODO: clean up the naming for consistency +#define TORCH_API CAFFE2_API diff --git a/torch/csrc/api/include/torch/data/dataloader.h b/torch/csrc/api/include/torch/data/dataloader.h index 14da9edbaf9f..61b220ad4819 100644 --- a/torch/csrc/api/include/torch/data/dataloader.h +++ b/torch/csrc/api/include/torch/data/dataloader.h @@ -38,7 +38,7 @@ make_data_loader( Dataset dataset, DataLoaderOptions options = DataLoaderOptions()) { const optional size = dataset.size(); - AT_CHECK( + TORCH_CHECK( size.has_value(), "Expected the dataset to be sized in " "order to construct the Sampler"); diff --git a/torch/csrc/api/include/torch/data/dataloader/base.h b/torch/csrc/api/include/torch/data/dataloader/base.h index 1ec478bac7f1..b0ad56eb6514 100644 --- a/torch/csrc/api/include/torch/data/dataloader/base.h +++ b/torch/csrc/api/include/torch/data/dataloader/base.h @@ -55,7 +55,7 @@ class DataLoaderBase { /// standard algorithms like `std::copy(dataloader.begin(), dataloader.end(), /// output_iterator)` are supported too. Iterator begin() { - AT_CHECK( + TORCH_CHECK( shuttle_.in_flight_jobs() == 0, "Attempted to get a new DataLoader iterator " "while another iterator is not yet exhausted"); diff --git a/torch/csrc/api/include/torch/data/datasets/chunk.h b/torch/csrc/api/include/torch/data/datasets/chunk.h index 74142591bf99..c519a964d42f 100644 --- a/torch/csrc/api/include/torch/data/datasets/chunk.h +++ b/torch/csrc/api/include/torch/data/datasets/chunk.h @@ -112,7 +112,7 @@ class BatchDataBuffer { batch_example_indices.value().size() == example_count) BatchRequestType& indices = batch_example_indices.value(); for (size_t i : indices) { - AT_CHECK(i < data_size, "Index out of range"); + TORCH_CHECK(i < data_size, "Index out of range"); batch.emplace_back(std::move(data[i])); } remaining_size -= example_count; @@ -249,16 +249,16 @@ struct ChunkDatasetOptions { : preloader_count_(preloader_count), batch_size_(batch_size), cache_size_(cache_size) { - AT_CHECK( + TORCH_CHECK( preloader_count_ > 0, "Preloader count is 0. At least one preloader needs to be specified."); - AT_CHECK( + TORCH_CHECK( batch_size_ > 0, "Batch size is 0. A positive batch size needs to be specified."); - AT_CHECK( + TORCH_CHECK( cache_size_ > 0, "Cache size is 0. A positive cache size needs to be specified."); - AT_CHECK( + TORCH_CHECK( cache_size_ >= batch_size_, "Cache size is less than batch size. Cache needs to be large enough to " "hold at least one batch."); @@ -323,11 +323,11 @@ class ChunkDataset final /// is dataset agnostic and does not need overriding in different chunk /// datasets. 
BatchType get_batch(size_t batch_size) override { - AT_CHECK( + TORCH_CHECK( batch_buffer_ != nullptr, "Dataset needs to call reset() before calling get_batch()."); - AT_CHECK( + TORCH_CHECK( batch_size == options_.batch_size_, "The requested batch size does not match with the initialized batch size.\n" " The requested batch size is ", batch_size, diff --git a/torch/csrc/api/include/torch/data/iterator.h b/torch/csrc/api/include/torch/data/iterator.h index 21e2d3d3d593..2ba1a5d33dfb 100644 --- a/torch/csrc/api/include/torch/data/iterator.h +++ b/torch/csrc/api/include/torch/data/iterator.h @@ -50,7 +50,7 @@ struct ValidIterator : public IteratorImpl { void next() override { // If we didn't get the very first batch yet, get it now. lazy_initialize(); - AT_CHECK( + TORCH_CHECK( batch_.has_value(), "Attempted to increment iterator past the end"); // Increment to the next batch. batch_ = next_batch_(); @@ -62,7 +62,7 @@ struct ValidIterator : public IteratorImpl { Batch& get() override { // If we didn't get the very first batch yet, get it now. lazy_initialize(); - AT_CHECK( + TORCH_CHECK( batch_.has_value(), "Attempted to dereference iterator that was past the end"); return batch_.value(); diff --git a/torch/csrc/api/include/torch/expanding_array.h b/torch/csrc/api/include/torch/expanding_array.h index a840a881d9bb..605e6e2e3d0f 100644 --- a/torch/csrc/api/include/torch/expanding_array.h +++ b/torch/csrc/api/include/torch/expanding_array.h @@ -31,7 +31,7 @@ class ExpandingArray { /// at runtime. /*implicit*/ ExpandingArray(at::ArrayRef values) { // clang-format off - AT_CHECK( + TORCH_CHECK( values.size() == D, "Expected ", D, " values, but instead got ", values.size()); // clang-format on diff --git a/torch/csrc/api/include/torch/nn/cloneable.h b/torch/csrc/api/include/torch/nn/cloneable.h index 99037250ac1f..c7b6e5aaf677 100644 --- a/torch/csrc/api/include/torch/nn/cloneable.h +++ b/torch/csrc/api/include/torch/nn/cloneable.h @@ -41,7 +41,7 @@ class Cloneable : public virtual Module { copy->buffers_.clear(); copy->children_.clear(); copy->reset(); - AT_CHECK( + TORCH_CHECK( copy->parameters_.size() == parameters_.size(), "The cloned module does not have the same number of " "parameters as the original module after calling reset(). " @@ -52,7 +52,7 @@ class Cloneable : public virtual Module { copy->parameters_[parameter.key()].set_data( device ? data.to(*device) : data); } - AT_CHECK( + TORCH_CHECK( copy->buffers_.size() == buffers_.size(), "The cloned module does not have the same number of " "buffers as the original module after calling reset(). " @@ -62,7 +62,7 @@ class Cloneable : public virtual Module { auto data = autograd::Variable(*buffer).data().clone(); copy->buffers_[buffer.key()].set_data(device ? data.to(*device) : data); } - AT_CHECK( + TORCH_CHECK( copy->children_.size() == children_.size(), "The cloned module does not have the same number of " "child modules as the original module after calling reset(). " @@ -80,7 +80,7 @@ class Cloneable : public virtual Module { // was registered under the same name as `this`), but you never know what // crazy things `reset()` does, so `dynamic_cast` just to be safe. 
auto clone = std::dynamic_pointer_cast(other.clone(device)); - AT_CHECK( + TORCH_CHECK( clone != nullptr, "Attempted to clone submodule, but it is of a " "different type than the submodule it was to be cloned into"); diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index 7b566b494842..6e6509532bfe 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -566,8 +566,8 @@ template std::shared_ptr Module::register_module( std::string name, std::shared_ptr module) { - AT_CHECK(!name.empty(), "Submodule name must not be empty"); - AT_CHECK( + TORCH_CHECK(!name.empty(), "Submodule name must not be empty"); + TORCH_CHECK( name.find('.') == std::string::npos, "Submodule name must not contain a dot (got '", name, diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 08deb0b526db..71b90487411e 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -383,7 +383,7 @@ struct AnyModule::Holder : public AnyModule::Placeholder { /// Calls `forward()` on the underlying module, casting each `Value` in the /// argument vector to a concrete value. Value forward(std::vector&& arguments) override { - AT_CHECK( + TORCH_CHECK( arguments.size() == sizeof...(ArgumentTypes), c10::demangle(type_info.name()), "'s forward() method expects ", @@ -466,7 +466,7 @@ AnyModule& AnyModule::operator=(std::shared_ptr module) { template AnyModule::Value AnyModule::any_forward(ArgumentTypes&&... arguments) { - AT_CHECK(!is_empty(), "Cannot call forward() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call forward() on an empty AnyModule"); std::vector values; values.reserve(sizeof...(ArgumentTypes)); torch::apply( @@ -483,13 +483,13 @@ ReturnType AnyModule::forward(ArgumentTypes&&... arguments) { template T& AnyModule::get() { - AT_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); return get_(); } template const T& AnyModule::get() const { - AT_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); return get_(); } @@ -499,20 +499,20 @@ T AnyModule::get() const { } inline std::shared_ptr AnyModule::ptr() const { - AT_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); return content_->ptr(); } template std::shared_ptr AnyModule::ptr() const { - AT_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); // Call get() but discard the value, just to do the type checking. get_(); return std::dynamic_pointer_cast(ptr()); } inline const std::type_info& AnyModule::type_info() const { - AT_CHECK(!is_empty(), "Cannot call type_info() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call type_info() on an empty AnyModule"); return content_->type_info; } diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 66078bc56577..c492ef6f0d34 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -78,7 +78,7 @@ struct ConvOptions { /// Base class for all (dimension-specialized) convolution modules. 
template -class ConvImpl : public torch::nn::Cloneable { +class TORCH_API ConvImpl : public torch::nn::Cloneable { public: ConvImpl( int64_t input_channels, diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index 1212dcaa7c5d..ad70eaff86d3 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -40,7 +40,7 @@ class DropoutImplBase : public torch::nn::Cloneable { /// about the exact semantics of this module. class TORCH_API DropoutImpl : public detail::DropoutImplBase { public: - using detail::DropoutImplBase::DropoutImplBase; + explicit DropoutImpl(DropoutOptions options_ = DropoutOptions()); /// During training, applies a noise mask to the input tensor. /// During evaluation, applies an identity function. @@ -62,7 +62,7 @@ class TORCH_API DropoutImpl : public detail::DropoutImplBase { class TORCH_API FeatureDropoutImpl : public detail::DropoutImplBase { public: - using detail::DropoutImplBase::DropoutImplBase; + explicit FeatureDropoutImpl(DropoutOptions options_ = DropoutOptions()); /// During training, applies a noise mask to the input tensor. /// During evaluation, applies an identity function. diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index 7d78d6eec61d..e6d161e9f56b 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -53,7 +53,7 @@ struct TORCH_API RNNOptionsBase { /// Base class for all RNN implementations (intended for code sharing). template -class RNNImplBase : public torch::nn::Cloneable { +class TORCH_API RNNImplBase : public torch::nn::Cloneable { public: /// These must line up with the CUDNN mode codes: /// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index 7a4818cfe821..1f1c17e731f7 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -161,7 +161,7 @@ class SequentialImpl : public Cloneable { /// \endrst template ReturnType forward(InputTypes&&... inputs) { - AT_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); + TORCH_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); auto iterator = modules_.begin(); auto input = iterator->any_forward(std::forward(inputs)...); @@ -263,7 +263,7 @@ class SequentialImpl : public Cloneable { static_assert( torch::detail::is_module::value, "Can only call Sequential::at with an nn::Module type"); - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].get(); } @@ -275,7 +275,7 @@ class SequentialImpl : public Cloneable { static_assert( torch::detail::is_module::value, "Can only call Sequential::at with an nn::Module type"); - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].get(); } @@ -283,7 +283,7 @@ class SequentialImpl : public Cloneable { /// underlying module at the given index. Throws an exception if the index is /// out of bounds. 
std::shared_ptr ptr(size_t index) const { - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].ptr(); } @@ -295,7 +295,7 @@ class SequentialImpl : public Cloneable { static_assert( torch::detail::is_module::value, "Can only call Sequential::ptr with an nn::Module type"); - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].ptr(); } diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h index be777be0ad86..bc1c59053758 100644 --- a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h +++ b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h @@ -73,10 +73,10 @@ std::vector parallel_apply( std::vector& modules, const std::vector& inputs, const optional>& devices = nullopt) { - AT_CHECK( + TORCH_CHECK( modules.size() == inputs.size(), "Must have as many inputs as modules"); if (devices) { - AT_CHECK( + TORCH_CHECK( modules.size() == devices->size(), "Must have as many devices as modules"); } @@ -140,7 +140,7 @@ Tensor data_parallel( int64_t dim = 0) { if (!devices) { const auto device_count = torch::cuda::device_count(); - AT_CHECK( + TORCH_CHECK( device_count > 0, "Expected at least one CUDA device to be available"); devices = std::vector(); devices->reserve(device_count); diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index e3cda201975b..1a033b7cf293 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -98,19 +98,19 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { /// Returns a shared pointer to the underlying module. const std::shared_ptr& ptr() const { - AT_CHECK(!is_empty(), "Accessing empty ModuleHolder"); + TORCH_CHECK(!is_empty(), "Accessing empty ModuleHolder"); return impl_; } /// Returns a pointer to the underlying module. Contained* get() { - AT_CHECK(!is_empty(), "Accessing empty ModuleHolder"); + TORCH_CHECK(!is_empty(), "Accessing empty ModuleHolder"); return impl_.get(); } /// Returns a const pointer to the underlying module. 
const Contained* get() const { - AT_CHECK(!is_empty(), "Accessing empty ModuleHolder"); + TORCH_CHECK(!is_empty(), "Accessing empty ModuleHolder"); return impl_.get(); } diff --git a/torch/csrc/api/include/torch/ordered_dict.h b/torch/csrc/api/include/torch/ordered_dict.h index 65b9958774e5..a26de4321548 100644 --- a/torch/csrc/api/include/torch/ordered_dict.h +++ b/torch/csrc/api/include/torch/ordered_dict.h @@ -295,41 +295,41 @@ typename OrderedDict::ConstIterator OrderedDict::end() template typename OrderedDict::Item& OrderedDict::front() { - AT_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); return items_.front(); } template const typename OrderedDict::Item& OrderedDict::front() const { - AT_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); return items_.front(); } template typename OrderedDict::Item& OrderedDict::back() { - AT_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); return items_.back(); } template const typename OrderedDict::Item& OrderedDict::back() const { - AT_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); return items_.back(); } template typename OrderedDict::Item& OrderedDict::operator[]( size_t index) { - AT_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); + TORCH_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); return items_[index]; } template const typename OrderedDict:: Item& OrderedDict::operator[](size_t index) const { - AT_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); + TORCH_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); return items_[index]; } @@ -352,7 +352,7 @@ const Value& OrderedDict::operator[](const Key& key) const { template template Value& OrderedDict::insert(K&& key, V&& value) { - AT_CHECK( + TORCH_CHECK( index_.count(key) == 0, key_description_, " '", key, "' already defined"); // Copy `key` here and move it into the index. items_.emplace_back(key, std::forward(value)); diff --git a/torch/csrc/api/include/torch/torch.h b/torch/csrc/api/include/torch/torch.h index e7d190c499df..b41e6eefffdd 100644 --- a/torch/csrc/api/include/torch/torch.h +++ b/torch/csrc/api/include/torch/torch.h @@ -4,6 +4,14 @@ #ifdef TORCH_API_INCLUDE_EXTENSION_H #include -#warning \ + +#define DEPRECATE_MESSAGE \ "Including torch/torch.h for C++ extensions is deprecated. Please include torch/extension.h" + +#ifdef _MSC_VER +# pragma message ( DEPRECATE_MESSAGE ) +#else +# warning DEPRECATE_MESSAGE +#endif + #endif // defined(TORCH_API_INCLUDE_EXTENSION_H) diff --git a/torch/csrc/api/include/torch/utils.h b/torch/csrc/api/include/torch/utils.h index 617265c4dc37..70fc19a972d3 100644 --- a/torch/csrc/api/include/torch/utils.h +++ b/torch/csrc/api/include/torch/utils.h @@ -26,4 +26,10 @@ using at::get_num_threads; // Sets the number of threads to be used in parallel region. using at::set_num_threads; +// Returns the number of threads used for inter-op parallelism. +using at::get_num_interop_threads; + +// Sets the number of threads to be used for inter-op parallelism. 
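Note on the `torch/torch.h` hunk above: it replaces the GCC/Clang-only `#warning` with a construct MSVC also accepts. A small self-contained illustration of that compile-time warning pattern (file and message names here are illustrative):

```cpp
// warn_demo.h -- emit a build-time deprecation note on both MSVC and GCC/Clang.
#define DEMO_DEPRECATE_MESSAGE \
  "Including warn_demo.h is deprecated. Please include new_header.h"

#ifdef _MSC_VER
// MSVC has no #warning directive; #pragma message prints to the build log
// and does expand macros in its argument.
#pragma message(DEMO_DEPRECATE_MESSAGE)
#else
// GCC/Clang: #warning does not macro-expand its operand, so the macro name
// itself appears in the diagnostic, which is still a visible hint.
#warning DEMO_DEPRECATE_MESSAGE
#endif
```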
+using at::set_num_interop_threads; + } // namespace torch diff --git a/torch/csrc/api/src/data/datasets/mnist.cpp b/torch/csrc/api/src/data/datasets/mnist.cpp index d77b4573ca53..7dfe17a2a089 100644 --- a/torch/csrc/api/src/data/datasets/mnist.cpp +++ b/torch/csrc/api/src/data/datasets/mnist.cpp @@ -45,7 +45,7 @@ uint32_t read_int32(std::ifstream& stream) { uint32_t expect_int32(std::ifstream& stream, uint32_t expected) { const auto value = read_int32(stream); // clang-format off - AT_CHECK(value == expected, + TORCH_CHECK(value == expected, "Expected to read number ", expected, " but found ", value, " instead"); // clang-format on return value; @@ -63,7 +63,7 @@ Tensor read_images(const std::string& root, bool train) { const auto path = join_paths(root, train ? kTrainImagesFilename : kTestImagesFilename); std::ifstream images(path, std::ios::binary); - AT_CHECK(images, "Error opening images file at ", path); + TORCH_CHECK(images, "Error opening images file at ", path); const auto count = train ? kTrainSize : kTestSize; @@ -83,7 +83,7 @@ Tensor read_targets(const std::string& root, bool train) { const auto path = join_paths(root, train ? kTrainTargetsFilename : kTestTargetsFilename); std::ifstream targets(path, std::ios::binary); - AT_CHECK(targets, "Error opening targets file at ", path); + TORCH_CHECK(targets, "Error opening targets file at ", path); const auto count = train ? kTrainSize : kTestSize; diff --git a/torch/csrc/api/src/nn/init.cpp b/torch/csrc/api/src/nn/init.cpp index 7d64b9f02da8..c16f2b2b9ab3 100644 --- a/torch/csrc/api/src/nn/init.cpp +++ b/torch/csrc/api/src/nn/init.cpp @@ -18,7 +18,7 @@ namespace { struct Fan { explicit Fan(Tensor& tensor) { const auto dimensions = tensor.ndimension(); - AT_CHECK( + TORCH_CHECK( dimensions >= 2, "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"); @@ -73,7 +73,7 @@ Tensor constant_(Tensor tensor, Scalar value) { Tensor dirac_(Tensor tensor) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( tensor.ndimension() >= 3 && tensor.ndimension() <= 5, "Only tensors with 3, 4, or 5 dimensions are supported"); @@ -100,7 +100,7 @@ Tensor dirac_(Tensor tensor) { Tensor eye_(Tensor matrix) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( matrix.ndimension() == 2, "Only tensors with 2 dimensions are supported"); return torch::eye_out(matrix, matrix.size(0), matrix.size(1)); } @@ -118,7 +118,7 @@ Tensor ones_(Tensor tensor) { Tensor orthogonal_(Tensor tensor, double gain) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( tensor.ndimension() >= 2, "Only tensors with 2 or more dimensions are supported"); @@ -151,7 +151,7 @@ Tensor orthogonal_(Tensor tensor, double gain) { Tensor sparse_(Tensor tensor, double sparsity, double std) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( tensor.ndimension() == 2, "Only tensors with 2 dimensions are supported"); const auto rows = tensor.size(0); diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index c0f54efb3fc4..e266656fdfd5 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -310,8 +310,8 @@ Tensor& Module::register_parameter( std::string name, Tensor tensor, bool requires_grad) { - AT_CHECK(!name.empty(), "Parameter name must not be empty"); - AT_CHECK( + TORCH_CHECK(!name.empty(), "Parameter name must not be empty"); + TORCH_CHECK( name.find('.') == std::string::npos, "Parameter name must not contain a dot (got '", name, @@ -321,8 +321,8 @@ Tensor& Module::register_parameter( } Tensor& 
Module::register_buffer(std::string name, Tensor tensor) { - AT_CHECK(!name.empty(), "Buffer name must not be empty"); - AT_CHECK( + TORCH_CHECK(!name.empty(), "Buffer name must not be empty"); + TORCH_CHECK( name.find('.') == std::string::npos, "Buffer name must not contain a dot (got '", name, @@ -388,7 +388,7 @@ std::ostream& operator<<(std::ostream& stream, const nn::Module& module) { serialize::OutputArchive& operator<<( serialize::OutputArchive& archive, const std::shared_ptr& module) { - AT_CHECK(module != nullptr, "Cannot serialize empty module"); + TORCH_CHECK(module != nullptr, "Cannot serialize empty module"); module->save(archive); return archive; } @@ -396,7 +396,7 @@ serialize::OutputArchive& operator<<( serialize::InputArchive& operator>>( serialize::InputArchive& archive, const std::shared_ptr& module) { - AT_CHECK(module != nullptr, "Cannot deserialize empty module"); + TORCH_CHECK(module != nullptr, "Cannot deserialize empty module"); module->load(archive); return archive; } diff --git a/torch/csrc/api/src/nn/modules/batchnorm.cpp b/torch/csrc/api/src/nn/modules/batchnorm.cpp index 8a542e5cefc0..7fab1f5f645a 100644 --- a/torch/csrc/api/src/nn/modules/batchnorm.cpp +++ b/torch/csrc/api/src/nn/modules/batchnorm.cpp @@ -42,7 +42,7 @@ void BatchNormImpl::pretty_print(std::ostream& stream) const { } Tensor BatchNormImpl::forward(const Tensor& input) { - AT_CHECK( + TORCH_CHECK( options.stateful_, "Calling BatchNorm::forward is only permitted when " "the 'stateful' option is true (was false). " @@ -56,7 +56,7 @@ Tensor BatchNormImpl::pure_forward( const Tensor& variance) { if (is_training()) { const auto num_channels = input.dim() > 1 ? input.size(1) : 1; - AT_CHECK( + TORCH_CHECK( input.numel() / num_channels > 1, "BatchNorm expected more than 1 value per channel when training!"); } diff --git a/torch/csrc/api/src/nn/modules/conv.cpp b/torch/csrc/api/src/nn/modules/conv.cpp index 42147438b963..739b7ccd7d21 100644 --- a/torch/csrc/api/src/nn/modules/conv.cpp +++ b/torch/csrc/api/src/nn/modules/conv.cpp @@ -22,7 +22,7 @@ template void ConvImpl::reset() { if (!options.transposed_) { for (auto pad : *options.output_padding_) { - AT_CHECK( + TORCH_CHECK( pad == 0, "Only transposed convolutions support output padding!"); } } diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp index c068f70d389c..84a0d916b7e1 100644 --- a/torch/csrc/api/src/nn/modules/dropout.cpp +++ b/torch/csrc/api/src/nn/modules/dropout.cpp @@ -14,8 +14,8 @@ namespace detail { template DropoutImplBase::DropoutImplBase(DropoutOptions options_) : options(options_) { - AT_CHECK(options.rate_ >= 0, "Dropout rate must not be less than zero"); - AT_CHECK(options.rate_ <= 1, "Dropout rate must not be greater than one"); + TORCH_CHECK(options.rate_ >= 0, "Dropout rate must not be less than zero"); + TORCH_CHECK(options.rate_ <= 1, "Dropout rate must not be greater than one"); } template @@ -27,6 +27,8 @@ template class DropoutImplBase; DropoutOptions::DropoutOptions(double rate) : rate_(rate) {} +DropoutImpl::DropoutImpl(DropoutOptions options_) : DropoutImplBase(options_) {} + Tensor DropoutImpl::forward(const Tensor& input) { return torch::dropout(input, options.rate_, this->is_training()); } @@ -35,6 +37,9 @@ void DropoutImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Dropout(rate=" << options.rate_ << ")"; } +FeatureDropoutImpl::FeatureDropoutImpl(DropoutOptions options_) + : DropoutImplBase(options_) {} + Tensor FeatureDropoutImpl::forward(const 
Tensor& input) { return torch::feature_dropout(input, options.rate_, this->is_training()); } diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp index e86217912fe8..15d9c2648bb4 100644 --- a/torch/csrc/api/src/serialize/input-archive.cpp +++ b/torch/csrc/api/src/serialize/input-archive.cpp @@ -30,7 +30,7 @@ bool InputArchive::try_read( // clang-format off auto read_param = is_buffer ? buffer : param; auto read_tensor = read_param->value().toTensor(); - AT_CHECK( + TORCH_CHECK( bool(buffer) == is_buffer, "Expected deserialized tensor for key '", key, "' to ", is_buffer ? "not " : "", "be a buffer, but it was not"); @@ -52,7 +52,7 @@ void InputArchive::read( const std::string& key, Tensor& tensor, bool is_buffer) { - AT_CHECK( + TORCH_CHECK( try_read(key, tensor, is_buffer), "No such serialized tensor '", key, @@ -69,7 +69,7 @@ bool InputArchive::try_read(const std::string& key, InputArchive& archive) { } void InputArchive::read(const std::string& key, InputArchive& archive) { - AT_CHECK( + TORCH_CHECK( try_read(key, archive), "No such serialized submodule: '", key, "'"); } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 512cfb60cb4b..b28a1f1e9366 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -214,7 +214,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// Returns true if the particular output edge is active, and that particular /// output of this function should be computed. bool should_compute_output(size_t output_edge_index) const { - AT_CHECK(output_edge_index < num_outputs(), "Index out of range"); + TORCH_CHECK(output_edge_index < num_outputs(), "Index out of range"); return next_edges_[output_edge_index].is_valid(); } diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 8df447cc0f8a..d762b558df65 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -74,7 +74,7 @@ Gather::~Gather() {} variable_list Gather::apply(variable_list&& inputs) { bool all_are_zero_dim = true; for (const auto& input : inputs) { - AT_CHECK( + TORCH_CHECK( input.is_cuda(), "All inputs to Gather must be CUDA tensors, got ", input.type()); diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index 1e98573b74d1..204b1075093c 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -15,7 +15,7 @@ namespace torch { namespace autograd { //TODO: change it to TORCH_API when we merge the libs -struct TORCH_API Scatter : public Function { +struct AT_CUDA_API Scatter : public Function { explicit Scatter( std::vector devices, const c10::optional>& chunk_sizes = c10::nullopt, @@ -34,7 +34,7 @@ struct TORCH_API Scatter : public Function { bool unsqueeze_scalars_; }; -struct TORCH_API Gather : public Function { +struct AT_CUDA_API Gather : public Function { explicit Gather(const at::Device& destination_device, int64_t dim = 0); ~Gather() override; diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 8a22777225aa..5275dfa8695e 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -27,6 +27,8 @@ std::list> all_event_lists; thread_local std::shared_ptr event_list; thread_local uint16_t thread_id; +ProfilerConfig::~ProfilerConfig() = default; + RangeEventList& getEventList() { if (!event_list) { std::lock_guard guard(all_event_lists_mutex); @@ -246,7 +248,7 
@@ RecordProfile::~RecordProfile() { } void RecordProfile::processEvents(const std::vector& events) { - AT_CHECK(out_, "could not open file"); + TORCH_CHECK(out_, "could not open file"); Event* start = nullptr; for (Event* e : events) { if(0 == strcmp(e->name(), "__start_profile")) { @@ -254,7 +256,7 @@ void RecordProfile::processEvents(const std::vector& events) { break; } } - AT_CHECK(start, "could not find start?"); + TORCH_CHECK(start, "could not find start?"); std::vector stack; out_ << "[\n"; bool first = true; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 5de1a2a39de7..78a6b419c085 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -101,9 +101,10 @@ enum class TORCH_API ProfilerState { NVTX, // only emit NVTX markers }; -struct ProfilerConfig { +struct TORCH_API ProfilerConfig { ProfilerConfig(ProfilerState state, bool report_input_shapes) : state(state), report_input_shapes(report_input_shapes) {} + ~ProfilerConfig(); ProfilerState state; bool report_input_shapes; }; diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 3357ca7a32a3..9b3ede12c38a 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -304,6 +304,13 @@ PyObject *THPVariable_get_requires_grad(THPVariable *self) END_HANDLE_TH_ERRORS } +PyObject *THPVariable_get_ndim(THPVariable *self) +{ + HANDLE_TH_ERRORS + return PyInt_FromLong(self->cdata.dim()); + END_HANDLE_TH_ERRORS +} + int THPVariable_set_requires_grad(THPVariable *self, PyObject *obj) { HANDLE_TH_ERRORS @@ -443,6 +450,7 @@ static struct PyGetSetDef THPVariable_properties[] = { {"dtype", (getter)THPVariable_dtype, nullptr, nullptr, nullptr}, {"layout", (getter)THPVariable_layout, nullptr, nullptr, nullptr}, {"device", (getter)THPVariable_device, nullptr, nullptr, nullptr}, + {"ndim", (getter)THPVariable_get_ndim, nullptr, nullptr, nullptr}, {nullptr} }; @@ -508,8 +516,8 @@ void initTensorImplConversion(PyObject* module) { m.def("_wrap_tensor_impl", [](void* ptr) { auto p = c10::intrusive_ptr:: unsafe_reclaim_from_nonowning(static_cast(ptr)); - AT_CHECK(p.defined(), "Can't wrap undefined tensor"); - AT_CHECK(!p->is_variable(), "Can wrap only non-variable tensor"); + TORCH_CHECK(p.defined(), "Can't wrap undefined tensor"); + TORCH_CHECK(!p->is_variable(), "Can wrap only non-variable tensor"); auto tensor = at::Tensor::wrap_tensor_impl(std::move(p)); return py::cast(torch::autograd::Variable( torch::autograd::make_variable(std::move(tensor), false))); diff --git a/torch/csrc/autograd/record_function.cpp b/torch/csrc/autograd/record_function.cpp index 75394b3c0cbe..9ecae6da64dc 100644 --- a/torch/csrc/autograd/record_function.cpp +++ b/torch/csrc/autograd/record_function.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace torch { namespace autograd { namespace profiler { namespace { @@ -8,6 +10,28 @@ std::vector start_callbacks; std::vector end_callbacks; size_t callback_needs_inputs = 0; thread_local RecordFunction* thread_local_func_ = nullptr; + +bool is_sampled_callbacks = false; +double sampling_prob = 1.0; +constexpr double kEps = 1e-10; +} + +void setSamplingProbability(double prob) { + if (std::abs(prob - 1.0) < kEps) { + is_sampled_callbacks = false; + } else { + TORCH_CHECK(prob > -kEps && prob < 1.0); + is_sampled_callbacks = true; + } + sampling_prob = prob; +} + +double getSamplingProbability() { + return sampling_prob; +} + +bool checkCallbacksSampled() { + return 
is_sampled_callbacks; } void pushCallback( diff --git a/torch/csrc/autograd/record_function.h b/torch/csrc/autograd/record_function.h index 7d25f55f2cab..def063f78a71 100644 --- a/torch/csrc/autograd/record_function.h +++ b/torch/csrc/autograd/record_function.h @@ -92,14 +92,25 @@ struct TORCH_API RecordFunction { TORCH_API bool hasCallbacks(); TORCH_API bool needsInputs(); +TORCH_API void setSamplingProbability(double); +TORCH_API double getSamplingProbability(); +TORCH_API bool checkCallbacksSampled(); + +inline bool checkCallbacksEnabled() { + return !checkCallbacksSampled() || + (((double) std::rand() / RAND_MAX) < getSamplingProbability()); +} + // optional argument - function's seq_no #define RECORD_FUNCTION(fn, inputs, ...) \ torch::autograd::profiler::RecordFunction guard; \ if (torch::autograd::profiler::hasCallbacks()) { \ - if (torch::autograd::profiler::needsInputs()) { \ - guard.before(fn, inputs, ##__VA_ARGS__); \ - } else { \ - guard.before(fn, ##__VA_ARGS__); \ + if (torch::autograd::profiler::checkCallbacksEnabled()) { \ + if (torch::autograd::profiler::needsInputs()) { \ + guard.before(fn, inputs, ##__VA_ARGS__); \ + } else { \ + guard.before(fn, ##__VA_ARGS__); \ + } \ } \ } diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 9fa2033c15d8..b5a6798f5aa8 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -30,7 +30,7 @@ Variable::Impl::Impl(at::Tensor data, std::unique_ptr au // set_requires_grad also checks error conditions. autograd_meta->set_requires_grad(requires_grad, this); - AT_CHECK( + TORCH_CHECK( !autograd_meta->grad_fn_ || !autograd_meta->requires_grad_, "requires_grad should be false if grad_fn is set"); if (!data_.defined()) { @@ -54,8 +54,8 @@ IntArrayRef Variable::Impl::strides() const { return data_.strides(); } -bool Variable::Impl::is_contiguous() const { - return data_.is_contiguous(); +bool Variable::Impl::is_contiguous(MemoryFormat memory_format) const { + return data_.is_contiguous(memory_format); } int64_t Variable::Impl::dim() const { @@ -170,12 +170,12 @@ void Variable::Impl::set_data(const at::Tensor &new_data) { device_opt_ = new_data.device(); type_id_ = new_data.dispatch_type().type_id(); - auto new_data_impl_copy = new_data.getIntrusivePtr()->shallow_copy_and_detach(); // Version counter is not shared when we replace a `Variable`'s underlying `Tensor` // by calling `set_data(...)`. The original version of the `Variable` is always preserved. // See NOTE [ Version Counter Sharing ] for details. 
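Circling back to the `record_function` change above: callbacks can now be sampled, and `checkCallbacksEnabled()` gates `RECORD_FUNCTION` on a coin flip against the configured probability. A standalone sketch of that gating logic (simplified; the real code keeps this state in the profiler namespace):

```cpp
#include <cmath>
#include <cstdlib>

namespace demo {

// Global sampling state, mirroring the shape of the patch above (simplified).
bool is_sampled = false;
double sampling_prob = 1.0;
constexpr double kEps = 1e-10;

void setSamplingProbability(double prob) {
  // A probability of ~1.0 disables sampling entirely (every callback fires).
  is_sampled = std::abs(prob - 1.0) >= kEps;
  sampling_prob = prob;
}

bool callbacksEnabledThisCall() {
  // When sampling is off, always run; otherwise run with probability sampling_prob.
  return !is_sampled ||
         (static_cast<double>(std::rand()) / RAND_MAX) < sampling_prob;
}

} // namespace demo

int main() {
  demo::setSamplingProbability(0.01);  // profile roughly 1% of calls
  if (demo::callbacksEnabledThisCall()) {
    // ... invoke the (hypothetical) start/end callbacks here ...
  }
}
```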
- auto saved_version_ = data_.unsafeGetTensorImpl()->version_counter().current_version(); - new_data_impl_copy->set_version_counter(saved_version_); + auto new_data_impl_copy = new_data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/data_.unsafeGetTensorImpl()->version_counter(), + /*allow_tensor_metadata_change=*/true); data_ = std::move(at::Tensor(new_data_impl_copy)); } @@ -188,7 +188,7 @@ Variable::DifferentiableViewImpl::DifferentiableViewImpl(Variable base, at::Tens : Variable::Impl(std::move(data), std::move(autograd_meta), false, std::move(gradient_edge)) { auto diff_view_meta = static_cast(get_autograd_meta()); diff_view_meta->base_ = std::move(base); - AT_CHECK(diff_view_meta->base_.defined(), "base is undefined"); + TORCH_CHECK(diff_view_meta->base_.defined(), "base is undefined"); if (diff_view_meta->base_.is_view()) { diff_view_meta->base_ = diff_view_meta->base_.base(); } @@ -238,7 +238,7 @@ void Variable::rebase_history(Edge gradient_edge) { auto diff_view_meta = static_cast(get_autograd_meta()); AT_ASSERT(gradient_edge.input_nr == 0); AT_ASSERT(gradient_edge.function); - AT_CHECK( + TORCH_CHECK( gradient_edge.function->num_inputs() == 1, "Functions which modify views in-place must return a single Variable"); diff_view_meta->output_nr_ = gradient_edge.input_nr; diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index ec42d56562c9..5cd7d648131e 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -137,14 +137,14 @@ struct TORCH_API Variable : public at::Tensor { // "Downcasts" a `Tensor` into a `Variable`. Only call this on tensors you // know are Variables. /*implicit*/ Variable(at::Tensor const& rhs) : at::Tensor(rhs) { - AT_CHECK( + TORCH_CHECK( is_variable() || !defined(), "Tensor that was converted to Variable was not actually a Variable"); } /*implicit*/ Variable(at::Tensor&& rhs) : at::Tensor(std::move(rhs)) { - AT_CHECK( + TORCH_CHECK( is_variable() || !defined(), "Tensor that was converted to Variable was not actually a Variable"); } @@ -355,7 +355,7 @@ struct TORCH_API Variable::AutogradMeta : public c10::AutogradMetaInterface { /// leaf variables that want to accumulate gradients, and false for all other /// variables. void set_requires_grad(bool requires_grad, at::TensorImpl* self_impl) override { - AT_CHECK( + TORCH_CHECK( !requires_grad || at::isFloatingType(at::typeMetaToScalarType(self_impl->dtype())), "Only Tensors of floating point dtype can require gradients"); requires_grad_ = requires_grad; @@ -409,7 +409,7 @@ struct TORCH_API Variable::Impl : public at::TensorImpl { int64_t numel() const override; at::IntArrayRef sizes() const override; at::IntArrayRef strides() const override; - bool is_contiguous() const override; + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const override; int64_t size(int64_t d) const override; int64_t stride(int64_t d) const override; void resize_dim(int64_t ndim) override; @@ -546,21 +546,22 @@ inline Variable make_variable_view( if (data.defined()) { if (is_differentiable) { /// Differentiable view. Track history with DifferentiableViewImpl. 
- auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto diff_view_meta = c10::guts::make_unique(); return Variable(c10::make_intrusive( std::move(base), std::move(data_copy), std::move(gradient_edge), std::move(diff_view_meta))); } else { /// Non-differentiable view. Just share version counter. - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/base.version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto autograd_meta = c10::guts::make_unique(); auto var = Variable(c10::make_intrusive( std::move(data_copy), std::move(autograd_meta), false, std::move(gradient_edge))); - var.set_version_counter(base.version_counter()); return var; } } @@ -571,12 +572,13 @@ inline Variable make_variable( at::Tensor data, bool requires_grad = false, bool allow_tensor_metadata_change = true) { - AT_CHECK( + TORCH_CHECK( !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto autograd_meta = c10::guts::make_unique(); return Variable(c10::make_intrusive(data_copy, std::move(autograd_meta), requires_grad)); @@ -588,7 +590,7 @@ inline Variable make_variable_consuming( at::Tensor data, bool requires_grad = false, bool allow_tensor_metadata_change = true) { - AT_CHECK( + TORCH_CHECK( !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { @@ -604,12 +606,13 @@ inline Variable make_variable( at::Tensor data, Edge gradient_edge, bool allow_tensor_metadata_change = true) { - AT_CHECK( + TORCH_CHECK( !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto autograd_meta = c10::guts::make_unique(); return Variable(c10::make_intrusive(data_copy, std::move(autograd_meta), false, std::move(gradient_edge))); @@ -624,7 +627,7 @@ inline Variable make_variable( /// in DEBUG mode and the tensor's dynamic type is not in fact `Variable`, /// throws a `std::invalid_argument` exception. 
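The `variable.h`/`variable.cpp` hunks above stop setting the version counter after the fact and instead pass it into `shallow_copy_and_detach` (a fresh counter for new variables, the base's counter for non-differentiable views). A toy sketch of that "copy metadata, choose the counter at copy time" idea, with made-up types rather than the real `TensorImpl` API:

```cpp
#include <cstdint>
#include <memory>

// Toy stand-ins; not the real c10::VariableVersion / TensorImpl.
struct VersionCounter {
  std::shared_ptr<uint32_t> value = std::make_shared<uint32_t>(0);
};

struct ToyImpl {
  int sizes = 0;
  VersionCounter version;
  bool allow_metadata_change = true;

  // Copy the metadata but let the caller decide which version counter the
  // copy observes -- a fresh one, or one shared with an existing tensor.
  std::shared_ptr<ToyImpl> shallow_copy_and_detach(
      VersionCounter counter, bool allow_change) const {
    auto copy = std::make_shared<ToyImpl>(*this);
    copy->version = std::move(counter);          // shared or fresh, chosen here
    copy->allow_metadata_change = allow_change;
    return copy;
  }
};

int main() {
  ToyImpl base;
  // Non-differentiable view: share the base's counter so in-place writes on
  // either tensor bump a version both of them see.
  auto view = base.shallow_copy_and_detach(base.version, /*allow_change=*/true);
  // Fresh variable: start from an independent counter.
  auto fresh = base.shallow_copy_and_detach(VersionCounter{}, /*allow_change=*/true);
  (void)view; (void)fresh;
}
```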
inline Variable& as_variable_ref(at::Tensor& tensor) { - AT_CHECK( + TORCH_CHECK( tensor.is_variable(), "Attempted to cast a Tensor to a Variable, but " "the dynamic type of the value is not Variable."); @@ -632,7 +635,7 @@ inline Variable& as_variable_ref(at::Tensor& tensor) { } inline const Variable& as_variable_ref(const at::Tensor& tensor) { - AT_CHECK( + TORCH_CHECK( tensor.is_variable(), "Attempted to cast a Tensor to a Variable, but " "the dynamic type of the value is not Variable."); @@ -767,7 +770,7 @@ inline Variable::Variable(c10::intrusive_ptr self) : at::Tensor(std::move(self)) {} inline Variable::Impl* Variable::get() const { - AT_CHECK(defined(), "Called Variable::get() on an undefined Variable"); + TORCH_CHECK(defined(), "Called Variable::get() on an undefined Variable"); return static_cast(impl_.get()); } }} // namespace torch::autograd diff --git a/torch/csrc/cuda/Event.cpp b/torch/csrc/cuda/Event.cpp index 216f7acf980e..c0e3bbb68edd 100644 --- a/torch/csrc/cuda/Event.cpp +++ b/torch/csrc/cuda/Event.cpp @@ -56,10 +56,10 @@ static PyObject * THCPEvent_from_ipc_handle( at::Device device = r.device(0); std::string handle_string = r.string(1); - AT_CHECK(handle_string.size() == sizeof(cudaIpcEventHandle_t), + TORCH_CHECK(handle_string.size() == sizeof(cudaIpcEventHandle_t), "cudaIpcEventHandle_t expects byte-like object of size ", sizeof(cudaIpcEventHandle_t), ", but got ", handle_string.size()); - AT_CHECK(device.type() == at::kCUDA, "Event can only be created on " + TORCH_CHECK(device.type() == at::kCUDA, "Event can only be created on " "CUDA devices, but got device type ", device.type()) THPObjectPtr ptr(type->tp_alloc(type, 0)); diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 308ca606f708..25ba19651215 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -181,7 +181,7 @@ std::vector scatter( if (chunk_sizes) { const int64_t chunk_size_sum = std::accumulate(chunk_sizes->begin(), chunk_sizes->end(), int64_t{0}); - AT_CHECK( + TORCH_CHECK( chunk_size_sum == tensor.size(dim), "given chunk sizes don't sum up to the tensor's size ", "(sum(chunk_sizes) == ", chunk_size_sum, @@ -190,7 +190,7 @@ std::vector scatter( int64_t chunk_start = 0; for (size_t chunk = 0; chunk < chunk_sizes->size(); ++chunk) { const int64_t chunk_size = (*chunk_sizes)[chunk]; - AT_CHECK(chunk_size > 0, "Chunk size must be positive"); + TORCH_CHECK(chunk_size > 0, "Chunk size must be positive"); chunks.push_back(tensor.narrow(dim, chunk_start, chunk_size)); chunk_start += chunk_size; } @@ -202,7 +202,7 @@ std::vector scatter( for (size_t chunk = 0; chunk < chunks.size(); ++chunk) { const auto device_index = static_cast(devices[chunk]); if (streams && (*streams)[chunk]) { - AT_CHECK( + TORCH_CHECK( (*streams)[chunk]->device_index() == device_index, "Expected the device associated with the stream at index ", chunk, " (was ", (*streams)[chunk]->device_index(), ") ", @@ -220,19 +220,19 @@ at::Tensor gather( at::TensorList tensors, int64_t dim, c10::optional destination_index) { - AT_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); + TORCH_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); at::Tensor result; int64_t total_size = 0; auto& first = tensors.front(); const auto first_size = first.sizes(); std::vector expected_size(first_size.begin(), first_size.end()); for (const auto& tensor : tensors) { - AT_CHECK( + TORCH_CHECK( tensor.is_cuda(), "Gather expects all inputs to have CUDA type"); AT_ASSERT(tensor.ndimension() == 
static_cast(expected_size.size())); expected_size[dim] = tensor.size(dim); for (size_t dimension = 0; dimension < expected_size.size(); ++dimension) { - AT_CHECK( + TORCH_CHECK( expected_size[dimension] == tensor.size(dimension), "Gather got an input of invalid size: got ", tensor.sizes(), ", but expected ", at::IntArrayRef(expected_size)); diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 2db8306252d8..e377e971f4df 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -250,7 +250,7 @@ void broadcast( const auto stream = (streams.empty() || !streams[i]) ? at::cuda::getCurrentCUDAStream(device).stream() : streams[i]->stream(); - AT_CHECK( + TORCH_CHECK( static_cast(numel) <= static_cast(count_max), "Broadcast tensor has ", numel, @@ -275,7 +275,7 @@ void reduce( const comm_list& user_comms) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - AT_CHECK( + TORCH_CHECK( root >= 0 && static_cast(root) < inputs.size(), "invalid root"); _check_inputs(inputs, outputs, 1, 1); diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp new file mode 100644 index 000000000000..5d6659559404 --- /dev/null +++ b/torch/csrc/distributed/c10d/comm.cpp @@ -0,0 +1,82 @@ +#include + +#include + +#include +#include +#include + +namespace c10d { +namespace { + +class BroadcastWork { + public: + BroadcastWork( + const std::shared_ptr& process_group, + std::vector bucket_tensors) + : bucket_tensors_(std::move(bucket_tensors)), + flat_tensor_({torch::utils::flatten_dense_tensors(bucket_tensors_)}), + work_(process_group->broadcast(flat_tensor_)) {} + + void finish() { + work_->wait(); + + // Copy the output of the broadcast operation back. + auto output_tensors = torch::utils::unflatten_dense_tensors( + flat_tensor_.front(), bucket_tensors_); + AT_ASSERT(output_tensors.size() == bucket_tensors_.size()); + for (size_t i = 0; i < output_tensors.size(); i++) { + bucket_tensors_[i].copy_(output_tensors[i], /*non_blocking=*/true); + } + } + + protected: + // The list of tensors to broadcast. They are guaranteed to be + // placed on the same device and have the same dtype. + std::vector bucket_tensors_; + + // The vector with a single flattened tensor containing the contents + // of the tensors in bucket_tensors_. It must be stored in a vector + // because c10d::ProcessGroup::broadcast takes a vector argument. + std::vector flat_tensor_; + + // The broadcast work that is kicked off upon construction. + std::shared_ptr work_; +}; + +} // namespace + +// Broadcast many tensors to all processes in the process group. +void broadcast_coalesced( + std::shared_ptr process_group, + at::TensorList tensors, + size_t buffer_size) { + // Coalesce tensors into buckets taking into account the maximum buffer size. + // This routine is multi-device aware, so the tensors can be split across + // multiple devices and can contain a mix of CPU and CUDA tensors. + const auto buckets = + compute_bucket_assignment_by_size(tensors.vec(), {buffer_size}); + + // Returns tensor at specified index in input tensor list. + const auto lookup = [&tensors](size_t index) { return tensors[index]; }; + + // We maintain a maximum of 2 in flight broadcast operations to avoid + // allocating too much memory (in case the specified tensors are very large). 
+ std::deque in_flight; + constexpr auto max_in_flight = 2; + for (const auto& bucket : buckets) { + if (in_flight.size() >= max_in_flight) { + in_flight.front().finish(); + in_flight.pop_front(); + } + + in_flight.emplace_back(process_group, c10::fmap(bucket, lookup)); + } + + while (!in_flight.empty()) { + in_flight.front().finish(); + in_flight.pop_front(); + } +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h new file mode 100644 index 000000000000..9aec2e6bac3e --- /dev/null +++ b/torch/csrc/distributed/c10d/comm.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +#include +#include + +namespace c10d { + +// Broadcast many tensors to all processes in the process group. +void broadcast_coalesced( + std::shared_ptr process_group, + at::TensorList tensors, + size_t buffer_size); + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index a1b14fd24da3..b8ce586ded42 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -534,6 +535,21 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("bucket_size"), py::call_guard()); + module.def( + "_broadcast_coalesced", + // Define a lambda such that the pybind11 prototype can take a std::vector + // for the tensor list argument, but still pass it to the underlying + // function as a c10::ArrayRef. + [](std::shared_ptr<::c10d::ProcessGroup> process_group, + std::vector tensors, + size_t buffer_size) { + broadcast_coalesced(process_group, tensors, buffer_size); + }, + py::arg("process_group"), + py::arg("tensors"), + py::arg("buffer_size"), + py::call_guard()); + Py_RETURN_TRUE; } diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 3b9caef7df4c..d919705d1e57 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -395,17 +395,19 @@ void Reducer::prepare_for_backward( "starting a new one. ", "", "This error indicates that your module has parameters that were ", - "not used in producing its output (the return value of `forward`). ", + "not used in producing loss. ", "", - "You can enable unused parameter detection by passing the keyword " + "You can enable unused parameter detection by (1) passing the keyword " "argument `find_unused_parameters=True` to ", - "`torch.nn.parallel.DistributedDataParallel`. ", + "`torch.nn.parallel.DistributedDataParallel`; (2) making sure all ", + "`forward` function outputs participate in calculating loss. " "", - "If you already have this argument set, then the distributed data ", - "parallel module wasn't able to locate the output tensors in the ", + "If you already have done the above two steps, then the distributed ", + "data parallel module wasn't able to locate the output tensors in the ", "return value of your module's `forward` function. ", - "Please include the structure of the return value of `forward` of ", - "your module when reporting this issue (e.g. list, dict, iterable)."); + "Please include the loss function and the structure of the return ", + "value of `forward` of your module when reporting this issue (e.g. ", + "list, dict, iterable)."); } // Reset accounting. 
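The new `broadcast_coalesced` above flattens each bucket, kicks off an asynchronous broadcast, and keeps at most two broadcasts in flight so very large models do not allocate all of their flat buffers at once. A generic sketch of that bounded-pipeline pattern, using a hypothetical `Work` type instead of the real c10d classes:

```cpp
#include <cstddef>
#include <deque>
#include <functional>
#include <vector>

// Hypothetical handle for an asynchronous operation (not the real c10d::Work).
struct Work {
  std::function<void()> on_finish;
  void finish() { if (on_finish) on_finish(); }  // e.g. wait + copy results back
};

// Launch one async op per bucket, but never keep more than max_in_flight
// outstanding: finish the oldest before starting a new one, then drain the rest.
void run_bounded(const std::vector<std::function<Work()>>& launchers,
                 std::size_t max_in_flight = 2) {
  std::deque<Work> in_flight;
  for (const auto& launch : launchers) {
    if (in_flight.size() >= max_in_flight) {
      in_flight.front().finish();
      in_flight.pop_front();
    }
    in_flight.push_back(launch());
  }
  while (!in_flight.empty()) {   // drain whatever is still pending
    in_flight.front().finish();
    in_flight.pop_front();
  }
}
```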
diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 4e9382f19c3f..5bec5935ef61 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -382,12 +382,11 @@ void THPStorage_(postInit)(PyObject *module) THPStorageClass = PyObject_GetAttrString(module,(char*)TH_CONCAT_STRING_2(Real,Storage)); if (!THPStorageClass) throw python_error(); - bool is_cuda = false; + at::Backend backend = at::Backend::CPU; #ifdef THC_GENERIC_FILE - is_cuda = true; + backend = at::Backend::CUDA; #endif - const char *type_name = TH_CONCAT_STRING_2(Real,); - torch::registerStoragePyTypeObject((PyTypeObject*)THPStorageClass, type_name, is_cuda, false); + torch::registerStoragePyTypeObject((PyTypeObject*)THPStorageClass, backend, TH_CONCAT_2(at::k, Real)); } #endif diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index d10868f96070..dd77ee86186d 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -856,7 +856,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { Gradient differentiate(std::shared_ptr& graph) { Gradient grad_desc; // Take ownership of the graph - AT_CHECK( + TORCH_CHECK( graph.use_count() == 1, "differentiate will mutate and destroy the graph, so it requires " "graph.use_count() == 1, but found %d", diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index c66ec81acc84..ef01b4c90543 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -85,7 +85,7 @@ c10::optional tryInsertConstant( return c10::nullopt; } if (loc) - n->setSourceLocation(std::make_shared(*loc)); + n->setSourceRange(*loc); if (scope) n->setScope(*scope); if (result_type) { diff --git a/torch/csrc/jit/custom_operator.h b/torch/csrc/jit/custom_operator.h index 87c2f74c847a..93a334ab9e08 100644 --- a/torch/csrc/jit/custom_operator.h +++ b/torch/csrc/jit/custom_operator.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -83,7 +83,7 @@ inline void checkArgumentVector( const FunctionSchema& inferredSchema, const FunctionSchema& providedSchema) { // clang-format off - AT_CHECK( + TORCH_CHECK( inferred.size() == provided.size(), "Inferred ", inferred.size(), " ", what, "(s) for operator implementation, but the provided schema specified ", @@ -92,7 +92,7 @@ inline void checkArgumentVector( // clang-format on for (size_t i = 0; i < provided.size(); ++i) { // clang-format off - AT_CHECK( + TORCH_CHECK( provided[i].type()->isSubtypeOf(inferred[i].type()), "Inferred type for ", what, " #", i, " was ", *inferred[i].type(), ", but the provided schema specified type ", *provided[i].type(), @@ -246,12 +246,33 @@ struct TORCH_API RegisterOperators { RegisterOperators& op( const std::string& name, Implementation&& implementation, - OperatorOptions options = OperatorOptions()) { + OperatorOptions options) { + registerOperator(createOperator( name, std::forward(implementation), options)); return *this; } + + template + RegisterOperators& op( + const std::string& name, + Implementation&& implementation) { + registrars_.emplace_back(std::make_shared(name, std::forward(implementation))); + + return *this; + } + +private: + // A c10::RegisterOperators instance is not copyable, so to make + // torch::jit::RegisterOperators copyable, we use shared_ptrs. + // We need to keep the c10::RegisterOperators instances around + // because this is an RAII pattern. In the destructor, the registered + // ops get de-registered. 
+ std::vector> registrars_; }; } // namespace jit + +using RegisterOperators = c10::RegisterOperators; + } // namespace torch diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 150838162ff5..0d64e0399a63 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -38,13 +38,7 @@ namespace onnx = ::ONNX_NAMESPACE; class ScriptModuleSerializer; std::string getNodeStackTraceString(const Node* n) { - std::stringstream ss; - if (n->getSourceLocation()) { - n->getSourceLocation()->highlight(ss); - } else { - ss << ""; - } - return ss.str(); + return n->sourceRange().str(); } void validateBlock( @@ -258,10 +252,8 @@ void EncoderBase::EncodeBlock( continue; } auto p_n = graph_proto->add_node(); - if (node->getSourceLocation() && !strip_doc_) { - std::stringstream ss; - node->getSourceLocation()->highlight(ss); - p_n->set_doc_string(ss.str()); + if (!strip_doc_) { + p_n->set_doc_string(node->sourceRange().str()); } for (auto input : node->inputs()) { if (input->node()->mustBeNone() && !is_raw_export) { @@ -512,7 +504,10 @@ class ScriptModuleSerializer final { // to dump the content of a tensor void writeTensorTable(torch::ModelDef* model_def); - void writeAttributeTable(); + // Write the list of ivalues to a file as a pickle program + void writePickleArchive( + const std::string& name, + const std::vector& ivalues); void writeLibs(torch::ModelDef* model_def); void convertModule( @@ -521,10 +516,8 @@ class ScriptModuleSerializer final { const std::string& name, torch::ModuleDef* module_def); - void convertParameter( - const script::Slot& param, - torch::ParameterDef* param_def, - bool is_parameter); + IValue moduleGetState(const script::Module& module); + bool moduleHasValidGetSetState(const script::Module& module); void convertClass(const ClassTypePtr& type, torch::ModelDef* model_def); @@ -534,7 +527,9 @@ class ScriptModuleSerializer final { // all tensors that will be stored std::vector tensor_table_; - std::vector attribute_table_; + // A list of attributes (indexed by attr_def->id()) and module state (indexed + // by module_def->id()) + std::vector pickled_ivalues_; // all classes used by this module hierarchy std::vector class_table_; @@ -664,8 +659,8 @@ void ScriptModuleSerializer::convertModel( convertModule( module, "", writer_.archiveName(), model_def->mutable_main_module()); - // This may write some attributes to the tensor_table_ - writeAttributeTable(); + + writePickleArchive("attributes.pkl", pickled_ivalues_); writeTensorTable(model_def); writeLibs(model_def); @@ -677,6 +672,82 @@ void ScriptModuleSerializer::convertModel( } } +bool ScriptModuleSerializer::moduleHasValidGetSetState( + const script::Module& module) { + // Check that the schemas for __getstate__ and __setstate__ are correct + auto getstate = module.module_object()->type()->getMethod("__getstate__"); + if (getstate == nullptr) { + return false; + } + auto get_schema = + module.module_object()->type()->getMethod("__getstate__")->getSchema(); + + // Check __getstate__ + // __getstate__ is expected to be (self) -> T + AT_CHECK( + get_schema.arguments().size() == 1, + "'__getstate__' must have 'self' as its only argument, but found ", + get_schema.arguments().size(), + " arguments"); + AT_CHECK( + get_schema.returns().size() == 1, + "'__getstate__' must return 1 value, but found ", + get_schema.returns().size()); + + // Check __setstate__ if the method exists + // __setstate__ is expected to be (self, T) -> None + // TODO: use getMethod("__getstate__") once methods are not lowered + auto 
setstate = module.class_compilation_unit().find_function("__setstate__"); + if (setstate == nullptr) { + return false; + } + auto set_schema = setstate->getSchema(); + + AT_CHECK( + set_schema.arguments().size() == 2, + "'__setstate__' must have 'self' and the state as its " + "only arguments, but found ", + set_schema.arguments().size(), + " arguments"); + AT_CHECK( + set_schema.returns().size() == 1, + "'__setstate__' must return None, but found ", + set_schema.returns().size(), + " return values"); + AT_CHECK( + set_schema.returns().at(0).type()->isSubtypeOf(NoneType::get()), + "'__setstate__' must return None, but found value of type", + set_schema.returns().at(0).type()->python_str()); + + // Check that the return type of __getstate__ matches the input to + // __setstate__ + auto get_type = get_schema.returns().at(0).type(); + auto set_type = set_schema.arguments().at(1).type(); + + AT_CHECK( + set_type->isSubtypeOf(get_type), + "'__getstate__'s return type (", + get_type->python_str(), + " does not match '__setstate__'s argument type (", + set_type->python_str(), + "))"); + + return true; +} + +/// Run module.__getstate__() and return the result +IValue ScriptModuleSerializer::moduleGetState(const script::Module& module) { + auto getstate = module.find_method("__getstate__"); + AT_CHECK( + getstate != nullptr, + "Cannot call '__getstate__' method because" + " it does not exist"); + + Stack stack; + getstate->run(stack); + return stack.at(0); +} + size_t ScriptModuleSerializer::addTensor(const at::Tensor& tensor) { tensor_table_.push_back(tensor); return tensor_table_.size() - 1; @@ -728,17 +799,18 @@ void ScriptModuleSerializer::writeTensorTable(torch::ModelDef* model_def) { } } -void ScriptModuleSerializer::writeAttributeTable() { +void ScriptModuleSerializer::writePickleArchive( + const std::string& name, + const std::vector& ivalues) { Pickler pickler(&tensor_table_); pickler.start(); pickler.startTuple(); - for (const IValue& ivalue : attribute_table_) { + for (const IValue& ivalue : ivalues) { pickler.addIValue(ivalue); } pickler.endTuple(); pickler.finish(); - writer_.writeRecord( - "attributes.pkl", pickler.stack().data(), pickler.stack().size()); + writer_.writeRecord(name, pickler.stack().data(), pickler.stack().size()); } void ScriptModuleSerializer::convertModule( @@ -748,19 +820,47 @@ void ScriptModuleSerializer::convertModule( torch::ModuleDef* module_def) { module_def->set_name(name); module_def->set_optimize(module.is_optimized()); - for (const auto& elem : module.get_parameters()) { + + // If __getstate__ and __setstate__ methods are provided, use those for + // serializing instead of serializing the attributes directly + bool user_provided_serialization = moduleHasValidGetSetState(module); + if (user_provided_serialization) { + // Run the '__getstate__' method on the module and store the result + pickled_ivalues_.emplace_back(moduleGetState(module)); + module_def->set_get_state_attribute_id(pickled_ivalues_.size() - 1); + } + + // Add all the parameters + for (const auto& param : module.get_parameters()) { torch::ParameterDef* param_def = module_def->add_parameters(); - convertParameter(elem, param_def, /*is_buffer=*/false); + param_def->set_name(param.name()); + param_def->set_is_buffer(false); + if (user_provided_serialization) { + // If a __getstate__ was used, don't write the actual tensor + param_def->set_tensor_id(-1); + } else { + param_def->set_tensor_id(addTensor(param.value().toTensor())); + } } + // Add all the attributes for (const auto& attribute : 
module.get_attributes()) { // Add attribute to ModuleDef torch::AttributeDef* attribute_def = module_def->add_attributes(); attribute_def->set_name(attribute.name()); attribute_def->set_type(attribute.type()->python_str()); - attribute_table_.push_back(attribute.value()); - attribute_def->set_id(attribute_table_.size() - 1); + if (!user_provided_serialization) { + // Write the attribute's index if it's actually saved, -1 if it needs to + // come from __getstate__ + pickled_ivalues_.push_back(attribute.value()); + attribute_def->set_id(pickled_ivalues_.size() - 1); + } else { + // The module had a __setstate__, so write the attribute name/type so + // it can be correctly imported, but it has no entry in the + // pickled_ivalues_ table + attribute_def->set_id(-1); + } } std::stringstream module_name; @@ -768,7 +868,7 @@ void ScriptModuleSerializer::convertModule( module_name << prefix << "_"; module_name << name; - if (module.get_methods().size() > 0) { + if (module.class_compilation_unit().get_functions().size() > 0) { std::ostringstream methods; methods << "op_version_set = " << CURRENT_OP_VERSION_SET << "\n"; PythonPrint( @@ -794,15 +894,6 @@ void ScriptModuleSerializer::convertModule( } } -void ScriptModuleSerializer::convertParameter( - const script::Slot& param, - torch::ParameterDef* param_def, - bool is_parameter) { - param_def->set_name(param.name()); - param_def->set_is_buffer(is_parameter); - param_def->set_tensor_id(addTensor(param.value().toTensor())); -} - // Pretty printing for ONNX constexpr char indent_char = ' '; constexpr size_t indent_multiplier = 2; diff --git a/torch/csrc/jit/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/fuser/cpu/fused_kernel.cpp index c044aca46234..c9603f5db3a8 100644 --- a/torch/csrc/jit/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/fuser/cpu/fused_kernel.cpp @@ -88,7 +88,7 @@ static void runCompiler( config.openmp = false; // disable for future compiles return runCompiler(cpp_file, so_file); } - AT_CHECK(r == 0, "Failed to compile a fused CPU kernel"); + TORCH_CHECK(r == 0, "Failed to compile a fused CPU kernel"); } static const std::string disas_string = "objdump -M intel -d \"${so_file}\""; diff --git a/torch/csrc/jit/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/fuser/cuda/fused_kernel.cpp index 521cfd2f2184..4f1809b5dd7f 100644 --- a/torch/csrc/jit/fuser/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/fuser/cuda/fused_kernel.cpp @@ -39,18 +39,6 @@ namespace cuda { // INSTEAD USE, e.g. nvrtc().cuLoadModule(...) // If a function is missing add it to the list in thnvrtc. -void checkCUDAVersion(const cudaDeviceProp& prop) { - if ((prop.major >= 6 && CUDA_VERSION < 8000) || - (prop.major >= 7 && CUDA_VERSION < 9000)) { - std::stringstream err_string; - err_string - << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: " - << CUDA_VERSION << " for the current GPU device " << prop.name - << " with device capability " << prop.major << "." 
<< prop.minor; - throw std::runtime_error(err_string.str()); - } -} - #ifdef USE_DIRECT_NVRTC std::pair, THNVRTC*> loadNVRTC() { return std::make_pair(nullptr, torch_load_nvrtc()); diff --git a/torch/csrc/jit/fuser/executor.cpp b/torch/csrc/jit/fuser/executor.cpp index fab904338c44..49dfa40634a4 100644 --- a/torch/csrc/jit/fuser/executor.cpp +++ b/torch/csrc/jit/fuser/executor.cpp @@ -69,7 +69,7 @@ static c10::optional> canRunKernel( const KernelSpec& spec, at::TensorList args) { // Short-circuits on size mismatch - AT_CHECK( + TORCH_CHECK( args.size() == spec.inputChunks().size(), "Expected ", spec.inputChunks().size(), diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index ac44b5ad08eb..fa3ae590015d 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -58,6 +61,11 @@ std::shared_ptr lastExecutedOptimizedGraph() { return last_executed_optimized_graph.lock(); } +void ExecutionPlan::run(Stack& stack) const { + InterpreterState(code).run(stack); + last_executed_optimized_graph = graph; +} + namespace { using tensor_list = std::vector; @@ -70,31 +78,6 @@ using autograd::variable_list; const size_t autodiffSubgraphNodeThreshold = 2; const size_t autodiffSubgraphInlineThreshold = 5; -struct ExecutionPlan { - ExecutionPlan() = default; - ExecutionPlan(std::shared_ptr graph) - : code(graph), graph(std::move(graph)) {} - - void run(Stack& stack) const { - InterpreterState(code).run(stack); - last_executed_optimized_graph = graph; - } - - operator bool() const { - return static_cast(graph); - } - - ExecutionPlanState getDebugState() { - ExecutionPlanState state; - state.code = &code; - state.graph = graph.get(); - return state; - } - - Code code; - std::shared_ptr graph; -}; - struct CaptureList { CaptureList(size_t capture_size) { capture_types_.reserve(capture_size); @@ -489,28 +472,16 @@ GraphExecutor* getGradExecutor(Operation& op) { // and different requires_grad states, and handles specializations for each // situation. GraphExecutor is completely unaware of tracing or module // parameters to keep the tracing concerns separated. 
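The refactor that follows pulls the shared state into `GraphExecutorImplBase` so a profiling-based executor can be swapped in behind the same `pImpl` pointer. A condensed sketch of that "pick an implementation at construction time" shape (names trimmed; not the real interfaces, which use `shared_ptr` and richer state):

```cpp
#include <memory>

// Minimal shape of the refactor: one abstract base, two implementations,
// and a constructor that picks between them based on a runtime flag.
struct ExecutorImplBase {
  virtual void run() = 0;
  virtual ~ExecutorImplBase() = default;
};

struct DefaultImpl : ExecutorImplBase {
  void run() override { /* specialize against an argument-spec cache, then run */ }
};

struct ProfilingImpl : ExecutorImplBase {
  void run() override { /* instrument the graph, collect profiles, then run */ }
};

bool& profilingMode() {           // stand-in for a getProfilingMode()-style flag
  static bool enabled = false;
  return enabled;
}

struct Executor {
  Executor()
      : pImpl(profilingMode()
                  ? std::unique_ptr<ExecutorImplBase>(new ProfilingImpl())
                  : std::unique_ptr<ExecutorImplBase>(new DefaultImpl())) {}
  void run() { pImpl->run(); }
  std::unique_ptr<ExecutorImplBase> pImpl;
};
```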
-struct GraphExecutorImpl { - static std::shared_ptr prepareGraph(std::shared_ptr& graph) { - auto copy = graph->copy(); - EraseShapeInformation(copy); - return copy; - } - - GraphExecutorImpl(std::shared_ptr graph, bool optimize) - : graph(prepareGraph(graph)), - // until we have correct alias analysis any use of mutable operators - // disables all optimization - optimize(optimize), - num_inputs(this->graph->inputs().size()), - arg_spec_creator_(*graph), - num_outputs(this->graph->outputs().size()) { +struct GraphExecutorImpl : public GraphExecutorImplBase { + GraphExecutorImpl(const std::shared_ptr& graph, bool optimize) + : GraphExecutorImplBase(graph, optimize), arg_spec_creator_(*graph) { logging::getLogger()->addStatValue( logging::runtime_counters::GRAPH_EXECUTORS_CONSTRUCTED, 1.0); } // entry point where execution begins - void run(Stack& stack) { - AT_CHECK( + void run(Stack& stack) override { + TORCH_CHECK( stack.size() >= num_inputs, "expected ", num_inputs, @@ -529,7 +500,7 @@ struct GraphExecutorImpl { return execution_plan.run(stack); } - GraphExecutorState getDebugState() { + GraphExecutorState getDebugState() override { GraphExecutorState state; state.graph = graph.get(); if (fallback) { @@ -541,7 +512,7 @@ struct GraphExecutorImpl { return state; } - private: + protected: friend struct GraphExecutor; const ExecutionPlan& getOrCompileFallback() { @@ -608,10 +579,11 @@ struct GraphExecutorImpl { for (Node* dnode : diff_nodes) { auto diff_graph = std::move(dnode->g(attr::Subgraph)); Gradient gradient = differentiate(diff_graph); - // Run post differentiation optimizations, Autodiff will replace some + // Run post differentiation optimizations, Autodiff will replace some // parts of graph with new graph, these new graphs usually consists of // control flows and miss shape information on nodes, so we run shape - // prop and differentiable optimizations to ensure the graph is optimized + // prop and differentiable optimizations to ensure the graph is + // optimized PropagateInputShapes(gradient.f); runOptimization(gradient.f); // run non diff optimization on the forward graph @@ -720,18 +692,9 @@ struct GraphExecutorImpl { } } - // The unoptimized starting graph. This field is effectively const, but we - // can't make it so because Graph::copy() is not const (and making it const is - // not that easy at this point). - std::shared_ptr graph; + ~GraphExecutorImpl() override = default; - // If false, we'll run the graph as we get it, without any optimizations. - // Useful for debugging. - const bool optimize; - const size_t num_inputs; ArgumentSpecCreator arg_spec_creator_; - const size_t num_outputs; - // Populated only when optimize is false (and in that case plan_cache will be // unused). The compiled version of graph. ExecutionPlan fallback; @@ -739,14 +702,15 @@ struct GraphExecutorImpl { // Mapping from argument configurations to optimized versions of the graph // that are specialized to the spec. std::unordered_map plan_cache; - - // GraphExecutors can be accessed from multiple threads, so this thread needs - // to be held every time we access the fallback or plan_cache. - std::mutex compile_mutex; }; GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize) - : pImpl(new GraphExecutorImpl(std::move(graph), optimize)) {} + : pImpl( + getProfilingMode() + ? 
dynamic_cast( + new ProfilingGraphExecutorImpl(graph, optimize)) + : dynamic_cast( + new GraphExecutorImpl(graph, optimize))) {} void GraphExecutor::run(Stack& inputs) { return pImpl->run(inputs); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 6c2cb6cbc65b..de51a0189df6 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -26,7 +26,7 @@ struct GraphExecutorState { std::unordered_map execution_plans; }; -struct GraphExecutorImpl; +struct GraphExecutorImplBase; struct TORCH_API GraphExecutor { GraphExecutor() = default; GraphExecutor(std::shared_ptr graph, bool optimize = true); @@ -38,7 +38,7 @@ struct TORCH_API GraphExecutor { GraphExecutorState getDebugState(); private: - std::shared_ptr pImpl; + std::shared_ptr pImpl; }; // These passes need to run before it is valid to pass to the interpreter @@ -48,6 +48,8 @@ TORCH_API void runRequiredPasses(const std::shared_ptr& g); TORCH_API void debugSetAutodiffSubgraphInlining(bool state); TORCH_API std::shared_ptr lastExecutedOptimizedGraph(); +TORCH_API bool& getProfilingMode(); + namespace detail { GraphExecutor* getGradExecutor(Operation& op); diff --git a/torch/csrc/jit/graph_executor_impl.h b/torch/csrc/jit/graph_executor_impl.h new file mode 100644 index 000000000000..d7d8b7a71f1f --- /dev/null +++ b/torch/csrc/jit/graph_executor_impl.h @@ -0,0 +1,101 @@ +#pragma once +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +struct ExecutionPlan { + ExecutionPlan() = default; + ExecutionPlan(std::shared_ptr graph) + : code(graph), graph(std::move(graph)) {} + + void run(Stack& stack) const; + + operator bool() const { + return static_cast(graph); + } + + ExecutionPlanState getDebugState() { + ExecutionPlanState state; + state.code = &code; + state.graph = graph.get(); + return state; + } + + Code code; + std::shared_ptr graph; +}; + +// a Graph can be created via tracing, or via a language-based frontend +// GraphExecutor runs it. It can run the same graph on many different sizes +// and different requires_grad states, and handles specializations for each +// situation. GraphExecutor is completely unaware of tracing or module +// parameters to keep the tracing concerns separated. +struct GraphExecutorImplBase { + static std::shared_ptr prepareGraph( + const std::shared_ptr& graph) { + auto copy = graph->copy(); + EraseShapeInformation(copy); + return copy; + } + + GraphExecutorImplBase(const std::shared_ptr& graph, bool optimize) + : graph(prepareGraph(graph)), + // until we have correct alias analysis any use of mutable operators + // disables all optimization + optimize(optimize), + num_inputs(this->graph->inputs().size()), + num_outputs(this->graph->outputs().size()) {} + + // entry point where execution begins + virtual void run(Stack& stack) = 0; + virtual GraphExecutorState getDebugState() = 0; + virtual ~GraphExecutorImplBase() = default; + + protected: + friend struct GraphExecutor; + + // The unoptimized starting graph. This field is effectively const, but we + // can't make it so because Graph::copy() is not const (and making it const is + // not that easy at this point). + std::shared_ptr graph; + + // If false, we'll run the graph as we get it, without any optimizations. + // Useful for debugging. 
+ const bool optimize; + const size_t num_inputs; + const size_t num_outputs; + + // GraphExecutors can be accessed from multiple threads, so this thread needs + // to be held every time we access the fallback or plan_cache. + std::mutex compile_mutex; +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 8d97460f6b6f..79311a07e010 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -58,8 +58,11 @@ class ScriptModuleDeserializer final { void convertModule(const torch::ModuleDef& module_def); void loadTensorTable(torch::ModelDef* model_def); - void loadAttributeTable(); + std::vector loadPickleArchive(const std::string& name); void importCallback(const std::string& qualifier); + void moduleSetState( + const std::shared_ptr& module, + IValue state); caffe2::serialize::PyTorchStreamReader reader_; // this is a hack to make sure the script module created in C++ is the @@ -69,7 +72,8 @@ class ScriptModuleDeserializer final { std::vector moduleStack_; std::vector tensor_table_; - std::vector attribute_table_; + std::vector pickled_ivalues_; + std::unordered_set imported_libs_; std::shared_ptr main_module_; @@ -130,16 +134,18 @@ void ScriptModuleDeserializer::deserialize( // Load extra files. for (const auto& kv : extra_files) { const std::string& key = "extra/" + kv.first; - at::DataPtr meta_ptr; - size_t meta_size; - std::tie(meta_ptr, meta_size) = reader_.getRecord(key); - extra_files[kv.first] = - std::string(static_cast(meta_ptr.get()), meta_size); + if (reader_.hasFile(key)) { + at::DataPtr meta_ptr; + size_t meta_size; + std::tie(meta_ptr, meta_size) = reader_.getRecord(key); + extra_files[kv.first] = + std::string(static_cast(meta_ptr.get()), meta_size); + } } loadTensorTable(&model_def); if (model_def.proto_version() >= 2) { - loadAttributeTable(); + pickled_ivalues_ = loadPickleArchive("attributes.pkl"); } // TODO: this can be simplified when C++/Python interop lands, @@ -154,13 +160,13 @@ void ScriptModuleDeserializer::loadTensorTable(torch::ModelDef* model_def) { } } -void ScriptModuleDeserializer::loadAttributeTable() { +std::vector ScriptModuleDeserializer::loadPickleArchive(const std::string& name) { at::DataPtr attributes_ptr; size_t attributes_size; std::tie(attributes_ptr, attributes_size) = - reader_.getRecord("attributes.pkl"); + reader_.getRecord(name); Unpickler unpickler(attributes_ptr.get(), attributes_size, &tensor_table_); - attribute_table_ = unpickler.parse_ivalue_list(); + return unpickler.parse_ivalue_list(); } at::Tensor ScriptModuleDeserializer::loadTensor( @@ -255,6 +261,21 @@ void ScriptModuleDeserializer::importCallback(const std::string& qualifier) { import_callback); } +void ScriptModuleDeserializer::moduleSetState( + const std::shared_ptr& module, + IValue state) { + auto setstate = module->class_compilation_unit().find_function("__setstate__"); + + AT_CHECK( + setstate != nullptr, + "Cannot call '__setstate__' method because" + " it does not exist"); + + // TODO: once modules are first class in the interpreter and methods are not + // lowered, change this to `module->run_method("__setstate__", {state});` + setstate->run({module->module_object(), state}); +} + void ScriptModuleDeserializer::convertModule( const torch::ModuleDef& module_def) { std::shared_ptr module = moduleLookup_(moduleStack_); @@ -282,10 +303,16 @@ void ScriptModuleDeserializer::convertModule( continue; } + IValue ivalue; + if (attr_def.id() >= 0) { + // attribute has no value in the table, set it to 
None for now. After + // __getstate__, check that all the attributes that are not Optional + // can't be None + ivalue = pickled_ivalues_.at(attr_def.id()); + } + module->register_attribute( - attr_def.name(), - typeParser.parseType(attr_def.type()), - attribute_table_.at(attr_def.id())); + attr_def.name(), typeParser.parseType(attr_def.type()), ivalue); } if (module_def.has_torchscript_arena()) { at::DataPtr data; @@ -303,6 +330,26 @@ void ScriptModuleDeserializer::convertModule( tensor_table_, import_callback); } + + if (module_def.has_get_state_attribute_id()) { + moduleSetState( + module, pickled_ivalues_.at(module_def.get_state_attribute_id())); + } + + for (const auto& slot : module->get_attributes()) { + // Verify that all the non-optional attributes have been initialized + // TODO: Issue #20497 + if (slot.type()->kind() != TypeKind::OptionalType) { + AT_CHECK( + !slot.value().isNone(), + "The field '", + slot.name(), + "' was left unitialized after __setstate__, but expected a ", + "value of type '", + slot.type()->python_str(), + "'"); + } + } } } // namespace diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index b4e2c4398753..369d12c63320 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -169,6 +169,38 @@ void initJITBindings(PyObject* module) { std::tuple>>(pyQParamDict); return InsertQuantDequantNodes(g, qparam_dict); }) + .def( + "_jit_pass_insert_quantdequant_for_weight_bias", + [](std::shared_ptr& moduleObj, + const std::string& method_name, + const std::string& param_name, + py::function pyGetQParamFunc) { + // For different static params we pass different getQParamFunc via + // same interface exposed by the quantizer. + if (param_name == std::string("weight")) { + auto getQParamFunc = + py::cast( + at::Tensor)>>(pyGetQParamFunc); + InsertQuantDequantNodesForParam( + moduleObj, + method_name, + param_name, + getQParamFunc, + at::ScalarType::QInt8); + } else if (param_name == std::string("bias")) { + auto getQParamFunc = + py::cast( + float, float)>>(pyGetQParamFunc); + InsertQuantDequantNodesForParam( + moduleObj, + method_name, + param_name, + getQParamFunc, + at::ScalarType::QInt32); + } else { + TORCH_CHECK(false, "Invalid Param Name"); + } + }) .def( "_jit_pass_quantlint", [](std::shared_ptr& g) { return QuantLinting(g); }) @@ -295,6 +327,9 @@ void initJITBindings(PyObject* module) { auto stack = toStack(args); checkAliasAnnotation(g, std::move(stack), unqualified_op_name); }) + .def( + "_jit_set_profiling_mode", + [](bool profiling_flag) { getProfilingMode() = profiling_flag; }) .def( "_jit_fuser_get_fused_kernel_code", [](Graph& g, std::vector inps) { @@ -374,8 +409,8 @@ void initJITBindings(PyObject* module) { try { auto symbol = Symbol::fromQualString(qualified_name); auto operations = getAllOperatorsFor(symbol); - AT_CHECK(!operations.empty(), "No such operator ", qualified_name); - AT_CHECK( + TORCH_CHECK(!operations.empty(), "No such operator ", qualified_name); + TORCH_CHECK( operations.size() == 1, "Found ", operations.size(), diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index e052456707f1..808412883d0a 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -330,7 +331,7 @@ struct Instruction { UseList inputs; ListHandle outputs; Symbol debug_name; // used in dump to understand the generated code - std::shared_ptr debug_location; // for error reporting + c10::optional debug_location; // for error 
reporting }; int relativeJump(int from_inst, int to_inst) { @@ -377,7 +378,7 @@ struct CodeImpl { void insertNodesFromBlock(Block* block) { for (auto node : block->nodes()) { - const auto& source_location = node->getSourceLocation(); + SourceRange source_location = node->sourceRange(); switch (node->kind()) { case prim::If: { // x = if c: @@ -481,7 +482,7 @@ struct CodeImpl { size_t insertInstruction(Node* n) { auto inst = insertInstruction( n->kind(), - n->getSourceLocation(), + n->sourceRange(), n->inputs(), moveFlags(n), n->outputs()); @@ -490,7 +491,7 @@ struct CodeImpl { } size_t insertInstruction( Symbol sym, - std::shared_ptr debug_location, + const SourceRange& debug_location, ArrayRef inputs, ArrayRef move_flags, ArrayRef outputs) { @@ -520,7 +521,7 @@ struct CodeImpl { } size_t insertAssign( - std::shared_ptr debug_location, + const SourceRange& debug_location, ArrayRef inputs, ArrayRef move_flags, ArrayRef outputs) { @@ -546,7 +547,7 @@ struct CodeImpl { list.size = 0; } void listInsert(ListHandle& list, int value) { - AT_CHECK( + TORCH_CHECK( list.start + list.size == (int)int_data.size(), "another list already started"); int_data.push_back(value); @@ -557,7 +558,7 @@ struct CodeImpl { list.size = 0; } void listInsert(ListHandle& list, int value) { - AT_CHECK( + TORCH_CHECK( list.start + list.size == (int)bool_data.size(), "another list already started"); bool_data.push_back(value); @@ -700,8 +701,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // the current thread will continue running before it suspends. InterpreterState state(intrusive_from_this()); e.future->addCallback([state]() { - c10::global_work_queue().run(InterpreterContinuation( - state, Stack(), autograd::GradMode::is_enabled())); + at::launch(InterpreterContinuation(state, Stack(), + autograd::GradMode::is_enabled())); }); return true; @@ -713,14 +714,10 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { } catch (std::exception& e) { // Error from the current thread bool is_jit_exception = dynamic_cast(&e); - if (instructions[pc].debug_location) { - handleError( - instructions[pc].debug_location->wrapException( - e, "operation failed in interpreter"), - is_jit_exception); - } else { - handleError(e.what(), is_jit_exception); - } + handleError( + instructions[pc].debug_location->wrapException( + e, "operation failed in interpreter"), + is_jit_exception); return false; } } diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index cb56dd710df7..d254109ea2dd 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -200,6 +200,14 @@ void Node::printAttributes(std::ostream& out, bool ignore_subgraph = false) out << "]"; } +SourceRange Node::sourceRange() const { + if(source_range_) { + return *source_range_; + } + std::stringstream ss; + return SourceRange(ss.str()); +} + static std::ostream& indent(std::ostream& out, size_t level) { for (size_t i = 0; i < level; ++i) { out << " "; @@ -224,8 +232,10 @@ std::ostream& Node::print( if (numAttributes() > 1 && kind() != prim::DifferentiableGraph) { printAttributes(out, /*ignore_subgraph=*/true); } + groups->push_back(this); } else { + out << kind().toQualString(); if (hasAttributes()) { printAttributes(out); @@ -241,6 +251,7 @@ std::ostream& Node::print( out << ", "; out << "scope: " << scName << "\n"; } + for (size_t i = 0; i < blocks().size(); ++i) { auto b = blocks()[i]; indent(out, level + 1) << "block" << i << "(" @@ -251,6 +262,7 @@ std::ostream& Node::print( } indent(out, level + 2) << "-> (" << b->outputs() << ")\n"; } + 
return out; } @@ -539,7 +551,6 @@ Block::Block(Graph* graph_, Node* node_) output_(graph_->create(prim::Return, 0)), input_(graph_->create(prim::Param, 0)), owning_node_(node_) { - input_->next() = output_; input_->prev() = output_; output_->next() = input_; @@ -642,6 +653,16 @@ void Graph::remapTypes(const std::function& type_map) { block()->remapTypes(type_map); } +void Value::inferTypeFrom(const at::Tensor& output) { + if (output.is_mkldnn()) { + // mkldnn tensor as opaque tensor doesn't have strides, so we can + // not create a CompleteTensorType + setType(DimensionedTensorType::create(output)); + return; + } + setType(CompleteTensorType::create(output)); +} + bool Value::mustBeNone() const { return node_->mustBeNone(); } @@ -973,7 +994,7 @@ void Node::destroy() { } void Node::cloneFrom(Node* s) { - setSourceLocation(s->getSourceLocation()); + s->source_range_ = s->source_range_; if (s->scope_ && !s->scope_->isBlank()) { scope_ = s->scope_; } @@ -1107,7 +1128,9 @@ Node* Node::insertBefore(Node* n) { Node* Node::insertAfter(Node* n) { AT_ASSERT(!inBlockList() && n->inBlockList()); AT_ASSERT(n->owningBlock()); - AT_ASSERTM(n->kind() != prim::Return, "Attempting to insert a Node after the Return node or before the Param node"); + AT_ASSERTM( + n->kind() != prim::Return, + "Attempting to insert a Node after the Return node or before the Param node"); this->owning_block_ = n->owningBlock(); Node* next = n->next(); n->next() = this; diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 3031dd2e4f57..89d7d2b451b2 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -162,9 +162,7 @@ struct Value { public: Value* setType(TypePtr type); - void inferTypeFrom(const at::Tensor& output) { - setType(CompleteTensorType::create(output)); - } + TORCH_API void inferTypeFrom(const at::Tensor& output); const TypePtr& type() const { AT_ASSERT(type_ != nullptr); return type_; @@ -249,7 +247,7 @@ struct TORCH_API Node { std::vector blocks_; Graph* graph_; Block* owning_block_; - std::shared_ptr source_location_; + c10::optional source_range_; ScopePtr scope_; // Assumes FunctionSchemas are persistent, so we don't manage their lifetime. // This field is effective a cache that's populated on attribute lookups and @@ -287,13 +285,12 @@ struct TORCH_API Node { NodeKind kind() const { return kind_; } - Node* setSourceLocation(std::shared_ptr sl) { - source_location_ = std::move(sl); + Node* setSourceRange(SourceRange r) { + source_range_ = std::move(r); return this; } - std::shared_ptr getSourceLocation() const { - return source_location_; - } + SourceRange sourceRange() const; + Graph* owningGraph() { return graph_; } @@ -591,8 +588,6 @@ struct TORCH_API Node { // template variable, returning nullptr if the cast is invalid.. // // Example usage: if(auto s = n.cast