diff --git a/.circleci/cimodel/data/caffe2_build_data.py b/.circleci/cimodel/data/caffe2_build_data.py index 446f61af8666..dcaa90e01f04 100644 --- a/.circleci/cimodel/data/caffe2_build_data.py +++ b/.circleci/cimodel/data/caffe2_build_data.py @@ -11,7 +11,6 @@ (Ver("gcc", "4.9"), [X("py2")]), ]), (Ver("ubuntu", "16.04"), [ - (Ver("cuda", "8.0"), [X("py2")]), (Ver("cuda", "9.0"), [ # TODO make explicit that this is a "secret TensorRT build" # (see https://github.com/pytorch/pytorch/pull/17323#discussion_r259446749) diff --git a/.circleci/cimodel/data/caffe2_build_definitions.py b/.circleci/cimodel/data/caffe2_build_definitions.py index aa67a49894a9..44ef7f4ac3d3 100644 --- a/.circleci/cimodel/data/caffe2_build_definitions.py +++ b/.circleci/cimodel/data/caffe2_build_definitions.py @@ -10,7 +10,7 @@ DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/" -DOCKER_IMAGE_VERSION = 273 +DOCKER_IMAGE_VERSION = 276 class Conf(object): diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index d18cbb8f17cc..7d102413daf5 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -8,7 +8,7 @@ (None, [ X("2.7.9"), X("2.7"), - X("3.5"), + ("3.5", [("important", [X(True)])]), X("nightly"), ]), ("gcc", [ @@ -28,7 +28,6 @@ ("5", [X("3.6")]), ]), ("cuda", [ - ("8", [X("3.6")]), ("9", [ # Note there are magic strings here # https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/build.sh#L21 @@ -37,7 +36,7 @@ # and # https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/build.sh#L153 # (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259453144) - X("2.7"), + ("2.7", [("important", [X(True)])]), X("3.6"), ]), ("9.2", [X("3.6")]), @@ -136,6 +135,7 @@ def child_constructor(self): next_nodes = { "xla": XlaConfigNode, "namedtensor": NamedTensorConfigNode, + "important": ImportantConfigNode, } return next_nodes[experimental_feature] @@ -156,6 +156,14 @@ def init2(self, node_name): self.props["is_namedtensor"] = node_name +class ImportantConfigNode(TreeConfigNode): + def modify_label(self, label): + return "IMPORTANT=" + str(label) + + def init2(self, node_name): + self.props["is_important"] = node_name + + class XenialCompilerConfigNode(TreeConfigNode): def init2(self, node_name): diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index 387410c562c7..f42d8db5c063 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -25,7 +25,8 @@ def __init__(self, gpu_resource=None, dependent_tests=None, parent_build=None, - is_namedtensor=False): + is_namedtensor=False, + is_important=False): self.distro = distro self.pyver = pyver @@ -37,6 +38,7 @@ def __init__(self, # (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259453608) self.is_xla = is_xla self.is_namedtensor = is_namedtensor + self.is_important = is_important self.restrict_phases = restrict_phases self.gpu_resource = gpu_resource @@ -46,7 +48,10 @@ def __init__(self, # TODO: Eliminate the special casing for docker paths # In the short term, we *will* need to support special casing as docker images are merged for caffe2 and pytorch def get_parms(self, for_docker): - leading = ["pytorch"] + leading = [] + if self.is_important and not for_docker: + leading.append("AAA") + leading.append("pytorch") if self.is_xla and not for_docker: leading.append("xla") if 
self.is_namedtensor and not for_docker: @@ -225,6 +230,7 @@ def instantiate_configs(): is_xla = fc.find_prop("is_xla") or False is_namedtensor = fc.find_prop("is_namedtensor") or False + is_important = fc.find_prop("is_important") or False gpu_resource = None if cuda_version and cuda_version != "10": @@ -239,9 +245,10 @@ def instantiate_configs(): restrict_phases, gpu_resource, is_namedtensor=is_namedtensor, + is_important=is_important, ) - if cuda_version == "8": + if cuda_version == "9" and python_version == "3.6": c.dependent_tests = gen_dependent_configs(c) config_list.append(c) diff --git a/.circleci/config.yml b/.circleci/config.yml index 69fe24f832fa..95155de67632 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,6 +139,11 @@ setup_ci_environment: &setup_ci_environment no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/setup_ci_environment.sh +# Installs expect and moreutils so that we can call `unbuffer` and `ts`. +# Also installs OpenMP +# !!!!NOTE!!!! this is copied into a binary_macos_brew_update job which is the +# same but does not install libomp. If you are changing this, consider if you +# need to change that step as well. macos_brew_update: &macos_brew_update name: Brew update and install moreutils, expect and libomp no_output_timeout: "1h" @@ -154,21 +159,6 @@ macos_brew_update: &macos_brew_update brew install expect brew install libomp -# In version 2.1 and above we could make this a command and pass a parameter to -# it, but in this version there is no way to pass a parameter to a step -binary_macos_brew_update: &binary_macos_brew_update - name: Brew update and install moreutils and expect - no_output_timeout: "1h" - command: | - set -ex - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards - brew update - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect ############################################################################## @@ -197,7 +187,7 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults docker pull ${DOCKER_IMAGE} >/dev/null export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - git submodule sync && git submodule update -q --init + git submodule sync && git submodule update -q --init --recursive docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace @@ -387,6 +377,7 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_test_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + ############################################################################## # Macos build defaults ############################################################################## @@ -498,13 +489,13 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults # do not need both the pytorch and builder repos, so this is a little wasteful # (smoke tests and upload jobs do not need the pytorch repo). 
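Editor's note: several of the steps above install `expect` and `moreutils` solely so the CI can call `unbuffer` and `ts`, and the Linux test defaults pipe their generated `./command.sh` through both. A minimal sketch of that logging pattern is below; it assumes both tools are installed and is an illustration of the idea, not a step taken from this config.

```bash
#!/bin/bash
# unbuffer (from expect) runs the command under a pseudo-terminal so its output stays
# line-buffered even though it feeds a pipe; ts (from moreutils) prefixes each line
# with a timestamp, which is what keeps the CI logs readable in real time.
set -eux -o pipefail

echo 'echo "building step 1"; sleep 1; echo "building step 2"' > ./command.sh
unbuffer bash ./command.sh 2>&1 | ts
```

Without `unbuffer`, the child process would switch to block buffering as soon as its stdout is a pipe, and `ts` would only see the output in large bursts rather than line by line.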
binary_checkout: &binary_checkout - name: Checkout + name: Checkout pytorch/builder repo command: ~/workspace/.circleci/scripts/binary_checkout.sh # Parses circleci arguments in a consistent way, essentially routing to the # correct pythonXgccXcudaXos build we want binary_populate_env: &binary_populate_env - name: Set up env + name: Set up binary env variables command: ~/workspace/.circleci/scripts/binary_populate_env.sh binary_install_miniconda: &binary_install_miniconda @@ -521,6 +512,25 @@ binary_run_in_docker: &binary_run_in_docker # This step only runs on circleci linux machine executors that themselves # need to start docker images command: ~/workspace/.circleci/scripts/binary_run_in_docker.sh + +# This is copied almost verbatim from the macos_brew_update job +# In version 2.1 and above we could make this a command and pass a parameter to +# it, but in this version there is no way to pass a parameter to a step +binary_macos_brew_update: &binary_macos_brew_update + name: Brew update and install moreutils and expect + no_output_timeout: "1h" + command: | + set -eux -o pipefail + # moreutils installs a `parallel` executable by default, which conflicts + # with the executable from the GNU `parallel`, so we must unlink GNU + # `parallel` first, and relink it afterwards + brew update + brew unlink parallel + brew install moreutils + brew link parallel --overwrite + brew install expect + + # binary linux build defaults ############################################################################## binary_linux_build: &binary_linux_build @@ -535,14 +545,14 @@ binary_linux_build: &binary_linux_build - run: name: Install unbuffer and ts command: | - set -ex + set -eux -o pipefail source /env retry yum -q -y install epel-release retry yum -q -y install expect moreutils - run: name: Upgrade gcc version (based on env var) command: | - set -ex + set -eux -o pipefail source /env if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then source "/builder/upgrade_gcc_abi.sh" @@ -550,6 +560,11 @@ binary_linux_build: &binary_linux_build # Env variables are not persisted into the next step echo "export PATH=$PATH" >> /env echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> /env + + # We need to set this variable manually because + # https://github.com/pytorch/pytorch/blob/master/torch/abi-check.cpp + # sets the ABI to 0 by default + echo "export _GLIBCXX_USE_CXX11_ABI=1" >> /env else echo "Not upgrading gcc version" fi @@ -573,10 +588,14 @@ binary_linux_test: &binary_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -607,6 +626,7 @@ binary_linux_upload: &binary_linux_upload no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/binary_linux_upload.sh + ############################################################################## # Macos binary build defaults # The root of everything is /Users/distiller/pytorch-ci-env/workspace @@ -630,7 +650,7 @@ binary_mac_build: &binary_mac_build name: Build no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh" cat "$script" source "$script" @@ -639,7 +659,7 @@ binary_mac_build: &binary_mac_build name: Test no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_test.sh" cat "$script" 
source "$script" @@ -671,6 +691,8 @@ binary_mac_upload: &binary_mac_upload script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_upload.sh" cat "$script" source "$script" + + # Nighlty build smoke tests defaults # These are the second-round smoke tests. These make sure that the binaries are # correct from a user perspective, testing that they exist from the cloud are @@ -682,10 +704,14 @@ smoke_linux_test: &smoke_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -695,8 +721,7 @@ smoke_linux_test: &smoke_linux_test set -ex cat >/home/circleci/project/ci_test_script.sh <&1 | ts + caffe2_py2_gcc4_8_ubuntu14_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-gcc4.8-ubuntu14.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_gcc4_8_ubuntu14_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-gcc4.8-ubuntu14.04-test" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:276" resource_class: large <<: *caffe2_linux_test_defaults caffe2_py2_gcc4_9_ubuntu14_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-gcc4.9-ubuntu14.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.9-ubuntu14.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.9-ubuntu14.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build: - environment: - BUILD_ENVIRONMENT: "caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:273" - <<: *caffe2_linux_build_defaults - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test: - environment: - BUILD_ENVIRONMENT: "caffe2-py2-cuda8.0-cudnn7-ubuntu16.04-test" - USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda8.0-cudnn7-ubuntu16.04:273" - resource_class: gpu.medium - <<: *caffe2_linux_test_defaults - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults caffe2_py2_mkl_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-mkl-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_py2_mkl_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-mkl-ubuntu16.04-test" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:276" resource_class: large <<: *caffe2_linux_test_defaults caffe2_onnx_py2_gcc5_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-onnx-py2-gcc5-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:276" <<: *caffe2_linux_build_defaults caffe2_onnx_py2_gcc5_ubuntu16_04_test: environment: BUILD_ENVIRONMENT: "caffe2-onnx-py2-gcc5-ubuntu16.04-test" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:276" resource_class: large <<: *caffe2_linux_test_defaults caffe2_py2_clang3_8_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-clang3.8-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_clang3_9_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-clang3.9-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_clang7_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-clang7-ubuntu16.04-build" - DOCKER_IMAGE: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_android_ubuntu16_04_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-android-ubuntu16.04-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:276" BUILD_ONLY: "1" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_0_cudnn7_centos7_build: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-centos7-build" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:276" <<: *caffe2_linux_build_defaults caffe2_py2_cuda9_0_cudnn7_centos7_test: environment: BUILD_ENVIRONMENT: "caffe2-py2-cuda9.0-cudnn7-centos7-test" USE_CUDA_DOCKER_RUNTIME: "1" - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:273" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:276" resource_class: gpu.medium <<: *caffe2_linux_test_defaults @@ -1352,6 +1361,7 @@ jobs: PYTHON_VERSION: "2" <<: *caffe2_macos_build_defaults + # update_s3_htmls job # These jobs create html files for every cpu/cu## folder in s3. The html # files just store the names of all the files in that folder (which are @@ -1363,6 +1373,8 @@ jobs: machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: @@ -1388,7 +1400,7 @@ jobs: echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env source /home/circleci/project/env - set -ex + set -eux -o pipefail retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } @@ -1416,15 +1428,12 @@ jobs: machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: <<: *binary_checkout - # N.B. This sources binary_populate_env so that it takes the Pytorch - # version listed there. The only variables it needs are the date and the - # version string. - - run: - <<: *binary_populate_env - run: <<: *binary_install_miniconda - run: @@ -1432,18 +1441,24 @@ jobs: no_output_timeout: "1h" command: | set +x - echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env + echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" > /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env + export DATE="$(date -u +%Y_%m_%d)" + retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) + } source /home/circleci/project/env - set -ex + set -eux -o pipefail + # This is hardcoded to match binary_install_miniconda.sh + export PATH="/home/circleci/project/miniconda/bin:$PATH" # Not any awscli will work. Most won't. 
This one will work - export PATH="$MINICONDA_ROOT/bin:$PATH" retry conda create -qyn aws36 python=3.6 source activate aws36 pip install awscli==1.16.46 "/home/circleci/project/builder/cron/upload_binary_sizes.sh" + ############################################################################## # Binary build specs individual job specifications ############################################################################## @@ -2059,6 +2074,7 @@ jobs: resource_class: gpu.medium <<: *binary_linux_test + # There is currently no testing for libtorch TODO # binary_linux_libtorch_2.7m_cpu_test: # environment: @@ -2843,13 +2859,13 @@ workflows: requires: - setup - pytorch_linux_trusty_py2_7_build - - pytorch_linux_trusty_py3_5_build: + - AAA_pytorch_linux_trusty_py3_5_build: requires: - setup - - pytorch_linux_trusty_py3_5_test: + - AAA_pytorch_linux_trusty_py3_5_test: requires: - setup - - pytorch_linux_trusty_py3_5_build + - AAA_pytorch_linux_trusty_py3_5_build - pytorch_linux_trusty_pynightly_build: requires: - setup @@ -2899,52 +2915,45 @@ workflows: requires: - setup - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_build: + - AAA_pytorch_linux_xenial_cuda9_cudnn7_py2_build: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_test: + - AAA_pytorch_linux_xenial_cuda9_cudnn7_py2_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_multigpu_test: + - AAA_pytorch_linux_xenial_cuda9_cudnn7_py2_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_build: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX2_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_NO_AVX_NO_AVX2_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_multigpu_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_slow_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_NO_AVX2_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda8_cudnn7_py3_nogpu_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_NO_AVX_NO_AVX2_test: requires: - setup - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_short_perf_test_gpu: - requires: - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_doc_push: - requires: - - pytorch_linux_xenial_cuda8_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_build: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_slow_test: requires: - setup - - pytorch_linux_xenial_cuda9_cudnn7_py2_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_linux_xenial_cuda9_cudnn7_py3_nogpu_test: requires: - setup - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_build: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_short_perf_test_gpu: requires: - - setup - - pytorch_linux_xenial_cuda9_cudnn7_py3_test: + - pytorch_linux_xenial_cuda9_cudnn7_py3_build + - pytorch_doc_push: requires: - - setup - pytorch_linux_xenial_cuda9_cudnn7_py3_build - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build: requires: @@ -2971,6 +2980,7 @@ workflows: - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build: requires: - setup + - 
caffe2_py2_gcc4_8_ubuntu14_04_build: requires: - setup @@ -2978,13 +2988,6 @@ workflows: requires: - setup - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build: - requires: - - setup - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_test: - requires: - - setup - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build: requires: - setup @@ -3087,6 +3090,7 @@ workflows: # requires: # - setup # - binary_linux_conda_3.6_cu90_build + ############################################################################## # Daily smoke test trigger ############################################################################## @@ -3899,9 +3903,17 @@ workflows: only: - master jobs: + - setup - update_s3_htmls_for_nightlies: context: org-member + requires: + - setup - update_s3_htmls_for_nightlies_devtoolset7: context: org-member + requires: + - setup - upload_binary_sizes: context: org-member + requires: + - setup + diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh index f4d9fa7f7beb..c1fa7472590b 100755 --- a/.circleci/scripts/binary_checkout.sh +++ b/.circleci/scripts/binary_checkout.sh @@ -1,6 +1,5 @@ #!/bin/bash - -set -ex +set -eux -o pipefail # This step runs on multiple executors with different envfile locations if [[ "$(uname)" == Darwin ]]; then # macos executor (builds and tests) @@ -20,13 +19,13 @@ export BUILDER_ROOT="$workdir/builder" # Clone the Pytorch branch git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT" pushd "$PYTORCH_ROOT" -if [[ -n "$CIRCLE_PR_NUMBER" ]]; then +if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then # "smoke" binary build on PRs git fetch --force origin "pull/${CIRCLE_PR_NUMBER}/head:remotes/origin/pull/${CIRCLE_PR_NUMBER}" git reset --hard "$CIRCLE_SHA1" git checkout -q -B "$CIRCLE_BRANCH" git reset --hard "$CIRCLE_SHA1" -elif [[ -n "$CIRCLE_SHA1" ]]; then +elif [[ -n "${CIRCLE_SHA1:-}" ]]; then # Scheduled workflows & "smoke" binary build on master on PR merges git reset --hard "$CIRCLE_SHA1" git checkout -q -B master diff --git a/.circleci/scripts/binary_install_miniconda.sh b/.circleci/scripts/binary_install_miniconda.sh index 05f4f7a3bfde..ea419ff3030b 100755 --- a/.circleci/scripts/binary_install_miniconda.sh +++ b/.circleci/scripts/binary_install_miniconda.sh @@ -1,15 +1,32 @@ #!/bin/bash -set -ex +set -eux -o pipefail + # This step runs on multiple executors with different envfile locations if [[ "$(uname)" == Darwin ]]; then - source "/Users/distiller/project/env" + envfile="/Users/distiller/project/env" elif [[ -d "/home/circleci/project" ]]; then # machine executor (binary tests) - source "/home/circleci/project/env" + envfile="/home/circleci/project/env" else # docker executor (binary builds) - source "/env" + envfile="/env" +fi + +# TODO this is super hacky and ugly. Basically, the binary_update_html job does +# not have an env file, since it does not call binary_populate_env.sh, since it +# does not have a BUILD_ENVIRONMENT. So for this one case, which we detect by a +# lack of an env file, we manually export the environment variables that we +# need to install miniconda +if [[ ! 
-f "$envfile" ]]; then + MINICONDA_ROOT="/home/circleci/project/miniconda" + workdir="/home/circleci/project" + retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) + } + export -f retry +else + source "$envfile" fi conda_sh="$workdir/install_miniconda.sh" @@ -22,10 +39,6 @@ chmod +x "$conda_sh" "$conda_sh" -b -p "$MINICONDA_ROOT" rm -f "$conda_sh" -# TODO we can probably remove the next two lines -export PATH="$MINICONDA_ROOT/bin:$PATH" -source "$MINICONDA_ROOT/bin/activate" - # We can't actually add miniconda to the PATH in the envfile, because that # breaks 'unbuffer' in Mac jobs. This is probably because conda comes with # a tclsh, which then gets inserted before the tclsh needed in /usr/bin diff --git a/.circleci/scripts/binary_linux_build.sh b/.circleci/scripts/binary_linux_build.sh index 38507ea06ce0..9061b86d42e5 100755 --- a/.circleci/scripts/binary_linux_build.sh +++ b/.circleci/scripts/binary_linux_build.sh @@ -1,7 +1,7 @@ #!/bin/bash echo "RUNNING ON $(uname -a) WITH $(nproc) CPUS AND $(free -m)" -set -ex +set -eux -o pipefail source /env # Defaults here so they can be changed in one place diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index bf867316ad3b..663fcfba465c 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -3,7 +3,7 @@ source /home/circleci/project/env cat >/home/circleci/project/ci_test_script.sh < /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env + echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH:-}" >> /home/circleci/project/env + echo "declare -x PYTHON_VERSION=${PYTHON_VERSION:-}" >> /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env - if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then + if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then echo "declare -x TORCH_CUDA_ARCH_LIST=5.2" >> /home/circleci/project/env fi export SCCACHE_MAX_JOBS=`expr $(nproc) - 1` @@ -97,21 +97,21 @@ if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then # This IAM user allows write access to S3 bucket for sccache & bazels3cache set +x - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2}" >> /home/circleci/project/env + echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" >> /home/circleci/project/env set -x else # This IAM user allows write access to S3 bucket for sccache set +x - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> /home/circleci/project/env + echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" >> /home/circleci/project/env set -x fi fi # This IAM user only allows read-write access to ECR set +x -export 
AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4} -export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4} +export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-} +export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-} eval $(aws ecr get-login --region us-east-1 --no-include-email) set -x diff --git a/.circleci/scripts/setup_linux_system_environment.sh b/.circleci/scripts/setup_linux_system_environment.sh index 2782b103a1f5..e6bc004aef6a 100755 --- a/.circleci/scripts/setup_linux_system_environment.sh +++ b/.circleci/scripts/setup_linux_system_environment.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -ex +set -eux -o pipefail # Set up CircleCI GPG keys for apt, if needed curl -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add - diff --git a/.circleci/verbatim-sources/binary-build-tests.yml b/.circleci/verbatim-sources/binary-build-tests.yml index 540151acb6cc..a260cd880838 100644 --- a/.circleci/verbatim-sources/binary-build-tests.yml +++ b/.circleci/verbatim-sources/binary-build-tests.yml @@ -1,3 +1,4 @@ + # There is currently no testing for libtorch TODO # binary_linux_libtorch_2.7m_cpu_test: # environment: diff --git a/.circleci/verbatim-sources/binary_update_htmls.yml b/.circleci/verbatim-sources/binary_update_htmls.yml index 969b3615e027..0ac7d16d0e37 100644 --- a/.circleci/verbatim-sources/binary_update_htmls.yml +++ b/.circleci/verbatim-sources/binary_update_htmls.yml @@ -1,3 +1,4 @@ + # update_s3_htmls job # These jobs create html files for every cpu/cu## folder in s3. The html # files just store the names of all the files in that folder (which are @@ -9,6 +10,8 @@ machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: @@ -34,7 +37,7 @@ echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env source /home/circleci/project/env - set -ex + set -eux -o pipefail retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) } @@ -62,15 +65,12 @@ machine: image: ubuntu-1604:201903-01 steps: + - attach_workspace: + at: ~/workspace - run: <<: *setup_linux_system_environment - run: <<: *binary_checkout - # N.B. This sources binary_populate_env so that it takes the Pytorch - # version listed there. The only variables it needs are the date and the - # version string. - - run: - <<: *binary_populate_env - run: <<: *binary_install_miniconda - run: @@ -78,15 +78,21 @@ no_output_timeout: "1h" command: | set +x - echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env + echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" > /home/circleci/project/env echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env + export DATE="$(date -u +%Y_%m_%d)" + retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) + } source /home/circleci/project/env - set -ex + set -eux -o pipefail + # This is hardcoded to match binary_install_miniconda.sh + export PATH="/home/circleci/project/miniconda/bin:$PATH" # Not any awscli will work. Most won't. 
This one will work - export PATH="$MINICONDA_ROOT/bin:$PATH" retry conda create -qyn aws36 python=3.6 source activate aws36 pip install awscli==1.16.46 "/home/circleci/project/builder/cron/upload_binary_sizes.sh" + diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index e4a4f0feb50a..d1cd51b6e222 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -139,6 +139,11 @@ setup_ci_environment: &setup_ci_environment no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/setup_ci_environment.sh +# Installs expect and moreutils so that we can call `unbuffer` and `ts`. +# Also installs OpenMP +# !!!!NOTE!!!! this is copied into a binary_macos_brew_update job which is the +# same but does not install libomp. If you are changing this, consider if you +# need to change that step as well. macos_brew_update: &macos_brew_update name: Brew update and install moreutils, expect and libomp no_output_timeout: "1h" @@ -154,18 +159,3 @@ macos_brew_update: &macos_brew_update brew install expect brew install libomp -# In version 2.1 and above we could make this a command and pass a parameter to -# it, but in this version there is no way to pass a parameter to a step -binary_macos_brew_update: &binary_macos_brew_update - name: Brew update and install moreutils and expect - no_output_timeout: "1h" - command: | - set -ex - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards - brew update - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect diff --git a/.circleci/verbatim-sources/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs-custom.yml index 47012225b7cc..79da6b5dd402 100644 --- a/.circleci/verbatim-sources/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs-custom.yml @@ -1,7 +1,8 @@ + pytorch_short_perf_test_gpu: environment: BUILD_ENVIRONMENT: pytorch-short-perf-test-gpu - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:300" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:300" PYTHON_VERSION: "3.6" USE_CUDA_DOCKER_RUNTIME: "1" resource_class: gpu.medium @@ -38,7 +39,7 @@ pytorch_doc_push: environment: BUILD_ENVIRONMENT: pytorch-doc-push - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda8-cudnn7-py3:300" + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9-cudnn7-py3:300" resource_class: large machine: image: ubuntu-1604:201903-01 @@ -197,6 +198,7 @@ export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4} set -x - git submodule sync && git submodule update -q --init + git submodule sync && git submodule update -q --init --recursive chmod a+x .jenkins/pytorch/macos-build.sh unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts + diff --git a/.circleci/verbatim-sources/job-specs-setup.yml b/.circleci/verbatim-sources/job-specs-setup.yml index 2032a463457a..9b748a4aba5a 100644 --- a/.circleci/verbatim-sources/job-specs-setup.yml +++ b/.circleci/verbatim-sources/job-specs-setup.yml @@ -1,3 +1,4 @@ + setup: docker: - image: circleci/python:3.7.3 @@ -10,3 +11,4 @@ - persist_to_workspace: root: . 
paths: .circleci/scripts + diff --git a/.circleci/verbatim-sources/linux-binary-build-defaults.yml b/.circleci/verbatim-sources/linux-binary-build-defaults.yml index 4b87359f1291..c9a0216f0557 100644 --- a/.circleci/verbatim-sources/linux-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/linux-binary-build-defaults.yml @@ -1,3 +1,4 @@ + # binary linux build defaults ############################################################################## binary_linux_build: &binary_linux_build @@ -12,14 +13,14 @@ binary_linux_build: &binary_linux_build - run: name: Install unbuffer and ts command: | - set -ex + set -eux -o pipefail source /env retry yum -q -y install epel-release retry yum -q -y install expect moreutils - run: name: Upgrade gcc version (based on env var) command: | - set -ex + set -eux -o pipefail source /env if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then source "/builder/upgrade_gcc_abi.sh" @@ -27,6 +28,11 @@ binary_linux_build: &binary_linux_build # Env variables are not persisted into the next step echo "export PATH=$PATH" >> /env echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> /env + + # We need to set this variable manually because + # https://github.com/pytorch/pytorch/blob/master/torch/abi-check.cpp + # sets the ABI to 0 by default + echo "export _GLIBCXX_USE_CXX11_ABI=1" >> /env else echo "Not upgrading gcc version" fi @@ -50,10 +56,14 @@ binary_linux_test: &binary_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -83,3 +93,4 @@ binary_linux_upload: &binary_linux_upload name: Upload no_output_timeout: "1h" command: ~/workspace/.circleci/scripts/binary_linux_upload.sh + diff --git a/.circleci/verbatim-sources/linux-build-defaults.yml b/.circleci/verbatim-sources/linux-build-defaults.yml index 605176da7819..106ad6f645e6 100644 --- a/.circleci/verbatim-sources/linux-build-defaults.yml +++ b/.circleci/verbatim-sources/linux-build-defaults.yml @@ -26,7 +26,7 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults docker pull ${DOCKER_IMAGE} >/dev/null export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - git submodule sync && git submodule update -q --init + git submodule sync && git submodule update -q --init --recursive docker cp /home/circleci/project/. 
$id:/var/lib/jenkins/workspace @@ -215,3 +215,4 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_test_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + diff --git a/.circleci/verbatim-sources/macos-binary-build-defaults.yml b/.circleci/verbatim-sources/macos-binary-build-defaults.yml index 2fc4ca37f1ec..c49f9bbe0289 100644 --- a/.circleci/verbatim-sources/macos-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/macos-binary-build-defaults.yml @@ -22,7 +22,7 @@ binary_mac_build: &binary_mac_build name: Build no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_build.sh" cat "$script" source "$script" @@ -31,7 +31,7 @@ binary_mac_build: &binary_mac_build name: Test no_output_timeout: "1h" command: | - set -ex + set -eux -o pipefail script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_test.sh" cat "$script" source "$script" @@ -63,3 +63,4 @@ binary_mac_upload: &binary_mac_upload script="/Users/distiller/project/pytorch/.circleci/scripts/binary_macos_upload.sh" cat "$script" source "$script" + diff --git a/.circleci/verbatim-sources/nightly-binary-build-defaults.yml b/.circleci/verbatim-sources/nightly-binary-build-defaults.yml index c95c74c2a43e..0d354a5e44f9 100644 --- a/.circleci/verbatim-sources/nightly-binary-build-defaults.yml +++ b/.circleci/verbatim-sources/nightly-binary-build-defaults.yml @@ -25,13 +25,13 @@ # do not need both the pytorch and builder repos, so this is a little wasteful # (smoke tests and upload jobs do not need the pytorch repo). 
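Editor's note: a recurring change in the scripts above is replacing `set -ex` with `set -eux -o pipefail` and guarding optional CircleCI variables with `${VAR:-}`. The sketch below shows why the two go together and repeats the retry helper these jobs define inline; the final `curl` URL is a placeholder for illustration, not a command from this diff.

```bash
#!/bin/bash
# -e: exit on any failed command; -u: treat unset variables as errors;
# -x: trace commands; -o pipefail: a pipeline fails if any stage fails.
set -eux -o pipefail

# With `set -u`, expanding $CIRCLE_PR_NUMBER directly would abort the script when the
# variable is unset (e.g. on scheduled builds); ${VAR:-} substitutes an empty string.
if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then
  echo "building pull request ${CIRCLE_PR_NUMBER}"
fi

# Retry helper as defined inline by the upload/update jobs: re-run with sleeps of 1, 2, 4, 8s.
retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

retry curl -fsSL https://example.com/health   # placeholder command to show usage
```

The same reasoning is behind the `${USE_CUDA_DOCKER_RUNTIME:-}` check and the `:-` defaults added to the AWS credential variables elsewhere in this diff.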
binary_checkout: &binary_checkout - name: Checkout + name: Checkout pytorch/builder repo command: ~/workspace/.circleci/scripts/binary_checkout.sh # Parses circleci arguments in a consistent way, essentially routing to the # correct pythonXgccXcudaXos build we want binary_populate_env: &binary_populate_env - name: Set up env + name: Set up binary env variables command: ~/workspace/.circleci/scripts/binary_populate_env.sh binary_install_miniconda: &binary_install_miniconda @@ -48,3 +48,21 @@ binary_run_in_docker: &binary_run_in_docker # This step only runs on circleci linux machine executors that themselves # need to start docker images command: ~/workspace/.circleci/scripts/binary_run_in_docker.sh + +# This is copied almost verbatim from the macos_brew_update job +# In version 2.1 and above we could make this a command and pass a parameter to +# it, but in this version there is no way to pass a parameter to a step +binary_macos_brew_update: &binary_macos_brew_update + name: Brew update and install moreutils and expect + no_output_timeout: "1h" + command: | + set -eux -o pipefail + # moreutils installs a `parallel` executable by default, which conflicts + # with the executable from the GNU `parallel`, so we must unlink GNU + # `parallel` first, and relink it afterwards + brew update + brew unlink parallel + brew install moreutils + brew link parallel --overwrite + brew install expect + diff --git a/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml b/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml index 729555bbe7d6..b8745547e785 100644 --- a/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml +++ b/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml @@ -1,3 +1,4 @@ + # Nighlty build smoke tests defaults # These are the second-round smoke tests. 
These make sure that the binaries are # correct from a user perspective, testing that they exist from the cloud are @@ -9,10 +10,14 @@ smoke_linux_test: &smoke_linux_test steps: - attach_workspace: at: ~/workspace + - attach_workspace: + at: /home/circleci/project - run: <<: *setup_linux_system_environment - run: <<: *setup_ci_environment + - run: + <<: *binary_checkout - run: <<: *binary_populate_env - run: @@ -22,8 +27,7 @@ smoke_linux_test: &smoke_linux_test set -ex cat >/home/circleci/project/ci_test_script.sh <=0.6.2" --user fi if [[ "${BUILD_ENVIRONMENT}" == *-NO_AVX-* ]]; then diff --git a/.jenkins/pytorch/win-build.sh b/.jenkins/pytorch/win-build.sh index 3b6f330dd5b9..dbe2fd48995a 100755 --- a/.jenkins/pytorch/win-build.sh +++ b/.jenkins/pytorch/win-build.sh @@ -15,6 +15,7 @@ COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-build SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) source "$SCRIPT_PARENT_DIR/common.sh" +export IMAGE_COMMIT_ID=`git rev-parse HEAD` export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} if [[ ${JOB_NAME} == *"develop"* ]]; then export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index dc4bc2ab4f24..4208d462f647 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -21,13 +21,42 @@ if "%REBUILD%"=="" ( pip install -q ninja ) git submodule sync --recursive git submodule update --init --recursive -set PATH=%TMP_DIR_WIN%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\libnvvp;%PATH% +if "%CUDA_VERSION%" == "9" goto cuda_build_9 +if "%CUDA_VERSION%" == "10" goto cuda_build_10 +goto cuda_build_end + +:cuda_build_9 + +:: Override VS env here +pushd . 
+call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on +popd +set DISTUTILS_USE_SDK=1 + set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDA_PATH_V9_0=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set CUDA_PATH_V9_0=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_10 + +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 +set CUDA_PATH_V10_1=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_common + +set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% +set CUDNN_ROOT_DIR=%CUDA_PATH% set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt -set CUDNN_LIB_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64 -set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDNN_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% + +:cuda_build_end + +set PATH=%TMP_DIR_WIN%\bin;%PATH% :: Target only our CI GPU machine's CUDA arch to speed up the build set TORCH_CUDA_ARCH_LIST=5.2 diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat index 4e4c3b7ad337..631f5d1e6f64 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -1,9 +1,17 @@ +if "%CUDA_VERSION%" == "9" set CUDA_SUFFIX=cuda90 +if "%CUDA_VERSION%" == "10" set CUDA_SUFFIX=cuda101 + +if "%CUDA_SUFFIX%" == "" ( + echo unknown CUDA version, please set `CUDA_VERSION` to 9 or 10. + exit /b 1 +) + if "%REBUILD%"=="" ( if "%BUILD_ENVIRONMENT%"=="" ( - curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.0_cuda90_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.0_cuda90_%BUILD_TYPE%.7z + curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z ) else ( - aws s3 cp s3://ossci-windows/magma_2.5.0_cuda90_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.0_cuda90_%BUILD_TYPE%.7z --quiet + aws s3 cp s3://ossci-windows/magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet ) - 7z x -aoa %TMP_DIR_WIN%\magma_2.5.0_cuda90_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma + 7z x -aoa %TMP_DIR_WIN%\magma_2.5.0_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma ) set MAGMA_HOME=%TMP_DIR_WIN%\magma diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index 4431e7d10643..345a03aacf77 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -25,21 +25,48 @@ pip install -q ninja future hypothesis "librosa>=0.6.2" psutil :: No need to install faulthandler since we only test Python >= 3.6 on Windows :: faulthandler is builtin since Python 3.3 +if "%CUDA_VERSION%" == "9" goto cuda_build_9 +if "%CUDA_VERSION%" == "10" goto cuda_build_10 +goto cuda_build_end + +:cuda_build_9 + pushd . 
-call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 +call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on popd -set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\libnvvp;%PATH% set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDA_PATH_V9_0=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set CUDA_PATH_V9_0=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_10 + +pushd . +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 +@echo on +popd + +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 +set CUDA_PATH_V10_1=%CUDA_PATH% + +goto cuda_build_common + +:cuda_build_common + +set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% +set CUDNN_ROOT_DIR=%CUDA_PATH% set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt -set CUDNN_LIB_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\lib\x64 -set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 -set CUDNN_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0 +set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% +set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin +set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice +set NUMBAPRO_NVVM=%CUDA_PATH%\nvvm\bin\nvvm64_32_0.dll + +:cuda_build_end + set PYTHONPATH=%TMP_DIR_WIN%\build;%PYTHONPATH% -set NUMBAPRO_CUDALIB=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin -set NUMBAPRO_LIBDEVICE=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\libdevice -set NUMBAPRO_NVVM=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\bin\nvvm64_32_0.dll if NOT "%BUILD_ENVIRONMENT%"=="" ( pushd %TMP_DIR_WIN%\build @@ -51,4 +78,7 @@ if NOT "%BUILD_ENVIRONMENT%"=="" ( xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ ) +@echo off +echo @echo off >> %TMP_DIR%/ci_scripts/pytorch_env_restore.bat for /f "usebackq tokens=*" %%i in (`set`) do echo set "%%i" >> %TMP_DIR%/ci_scripts/pytorch_env_restore.bat +@echo on diff --git a/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat b/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat index 0c13e1ccc4fa..d86692dbabba 100644 --- a/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat +++ b/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat @@ -4,11 +4,22 @@ cd test\custom_operator :: Build the custom operator library. mkdir build -cd build +pushd build + +echo "Executing CMake for custom_operator test..." + :: Note: Caffe2 does not support MSVC + CUDA + Debug mode (has to be Release mode) cmake -DCMAKE_PREFIX_PATH=%TMP_DIR_WIN%\build\torch -DCMAKE_BUILD_TYPE=Release -GNinja .. +if ERRORLEVEL 1 exit /b 1 + +echo "Executing Ninja for custom_operator test..." + ninja -v -cd .. +if ERRORLEVEL 1 exit /b 1 + +echo "Ninja succeeded for custom_operator test." + +popd :: Run tests Python-side and export a script module. 
python test_custom_ops.py -v diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index c51066622684..425a4b8365aa 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -6,6 +6,7 @@ COMPACT_JOB_NAME=pytorch-win-ws2016-cuda9-cudnn7-py3-test SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) source "$SCRIPT_PARENT_DIR/common.sh" +export IMAGE_COMMIT_ID=`git rev-parse HEAD` export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID} if [[ ${JOB_NAME} == *"develop"* ]]; then export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG} diff --git a/CITATION b/CITATION index 046a2fa42038..9597a50fa754 100644 --- a/CITATION +++ b/CITATION @@ -1,6 +1,6 @@ @inproceedings{paszke2017automatic, - title={Automatic differentiation in PyTorch}, + title={Automatic Differentiation in {PyTorch}}, author={Paszke, Adam and Gross, Sam and Chintala, Soumith and Chanan, Gregory and Yang, Edward and DeVito, Zachary and Lin, Zeming and Desmaison, Alban and Antiga, Luca and Lerer, Adam}, - booktitle={NIPS-W}, + booktitle={NIPS Autodiff Workshop}, year={2017} } diff --git a/CMakeLists.txt b/CMakeLists.txt index 70264b802390..172b2d3b7296 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,11 @@ set(CMAKE_CXX_STANDARD 11) if (NOT MSVC) set(CMAKE_C_STANDARD 11) endif() +if (DEFINED GLIBCXX_USE_CXX11_ABI) + if (${GLIBCXX_USE_CXX11_ABI} EQUAL 1) + set(CXX_STANDARD_REQUIRED ON) + endif() +endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -61,7 +66,6 @@ endif() # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. include(CMakeDependentOption) -option(BUILD_TORCH "Build Torch" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) option(BUILD_ATEN_ONLY "Build only a subset focused on ATen only" OFF) option(BUILD_BINARY "Build C++ binaries" OFF) @@ -318,6 +322,10 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constexpr-not-const") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") + endif() + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") endif() if ((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0"))) OR (CMAKE_COMPILER_IS_GNUCXX @@ -343,7 +351,9 @@ if(NOT MSVC) else() foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) if (${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") @@ -508,6 +518,7 @@ if (BUILD_SHARED_LIBS) ${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix DESTINATION share/cmake/Caffe2/ COMPONENT dev) + install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2 FILE Caffe2Targets.cmake COMPONENT dev) diff --git a/README.md b/README.md index 3b4ee37e4231..be886cda25b5 100644 --- a/README.md +++ b/README.md @@ -151,13 +151,13 @@ They requires JetPack 4.2 and above and are maintained by @dusty-nv ### From Source If you are installing from source, we highly recommend installing an 
[Anaconda](https://www.anaconda.com/distribution/#download-section) environment. -You will get a high-quality BLAS library (MKL) and you get a controlled compiler version regardless of your Linux distro. +You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. Once you have [Anaconda](https://www.anaconda.com/distribution/#download-section) installed, here are the instructions. If you want to compile with CUDA support, install -- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 7.5 or above -- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v6.x or above +- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 9 or above +- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above If you want to disable CUDA support, export environment variable `NO_CUDA=1`. Other potentially useful environment variables may be found in `setup.py`. @@ -175,7 +175,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing On Linux ```bash # Add LAPACK support for the GPU if needed -conda install -c pytorch magma-cuda90 # or [magma-cuda80 | magma-cuda92 | magma-cuda100 ] depending on your cuda version +conda install -c pytorch magma-cuda90 # or [magma-cuda92 | magma-cuda100 ] depending on your cuda version ``` #### Get the PyTorch Source @@ -183,7 +183,7 @@ conda install -c pytorch magma-cuda90 # or [magma-cuda80 | magma-cuda92 | magma- git clone --recursive https://github.com/pytorch/pytorch cd pytorch # if you are updating an existing checkout -git submodule sync +git submodule sync git submodule update --init --recursive ``` @@ -209,9 +209,6 @@ If the version of Visual Studio 2017 is higher than 15.4.5, installing of "VC++
There is no guarantee that the build will work correctly with VC++ 2017 toolsets other than version 15.4 v14.11.
"VC++ 2017 version 15.4 v14.11 toolset" might be installed onto already installed Visual Studio 2017 by running its installation once again and checking the corresponding checkbox under "Individual components"/"Compilers, build tools, and runtimes". -For building against CUDA 8.0 Visual Studio 2015 Update 3 (version 14.0), and the [patch](https://download.microsoft.com/download/8/1/d/81dbe6bb-ed92-411a-bef5-3a75ff972c6a/vc14-kb4020481.exe) are needed to be installed too. -The details of the patch can be found [here](https://support.microsoft.com/en-gb/help/4020481/fix-link-exe-crashes-with-a-fatal-lnk1000-error-when-you-use-wholearch). - NVTX is a part of CUDA distributive, where it is called "Nsight Compute". For installing it onto already installed CUDA run CUDA installation once again and check the corresponding checkbox. Be sure that CUDA with Nsight Compute is installed after Visual Studio 2017. @@ -221,9 +218,6 @@ REM [Optional] The following two lines are needed for Python 2.7, but the suppor set MSSdk=1 set FORCE_PY27_BUILD=1 -REM [Optional] As for CUDA 8, VS2015 Update 3 is required; use the following line. -set "CUDAHOSTCXX=%VS140COMNTOOLS%..\..\VC\bin\amd64\cl.exe" - set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 set DISTUTILS_USE_SDK=1 diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 213d6465d2d1..b3d0a4c32457 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -252,12 +252,7 @@ IF(USE_CUDA AND NOT USE_ROCM) # build fake CuFFT lib in build dir EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc) - if(${CUDA_VERSION_MAJOR} EQUAL "8") - SET(CUFFT_FAKELINK_OPTIONS - --generate-code arch=compute_35,code=sm_35 - --generate-code arch=compute_50,code=sm_50 - --generate-code arch=compute_60,code=sm_60) - elseif(${CUDA_VERSION_MAJOR} EQUAL "9") + if(${CUDA_VERSION_MAJOR} EQUAL "9") SET(CUFFT_FAKELINK_OPTIONS --generate-code arch=compute_35,code=sm_35 --generate-code arch=compute_50,code=sm_50 diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index b1bed17f4140..8842eddd1fa2 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -30,7 +30,7 @@ inline std::pair collapse_dims( T* strides, int64_t dims, const int excludeDim = -1) { - AT_CHECK( + TORCH_CHECK( excludeDim >= -1 && excludeDim < dims, "expected excluded dim between -1 and dims - 1"); @@ -331,69 +331,6 @@ apply_op(int64_t numel, int64_t offset, const Op& op, Args... iters) { } } - -inline void apply_kernel(){}; - -// TODO: Deal elegantly with 0-dim tensors. iters.strides_ of 0-dim -// strided_tensor_iter will be of size 0 for dim 0 and iters.strides_[iters.dim_ -// - 1] will index at -1. C++14 integer_sequence could be of use here. -template -inline void -apply_kernel(int64_t numel, int64_t offset, const Op& op, Args... 
iters) { - if (offset > 0) - forward(offset, iters...); - int64_t size = std::min(numel, max_iterate_size(iters...)); - op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); - iterate(size, iters...); - iterate_overflow(iters...); - int64_t i = size; - size = std::min(numel, max_iterate_size(iters...)); - for (; i < numel;) { - op(size, iters.data_..., iters.strides_[iters.dim_ - 1]...); - iterate(size, iters...); - i += size; - iterate_overflow(iters...); - } -} - -template -inline void -CPU_tensor_parallel_kernel_apply2(Tensor tensor1, Tensor tensor2, const Op op) { - if (!_apply_preamble({tensor1, tensor2})) - return; - if (tensor1.numel() == 1) { - op(1, tensor1.data(), tensor2.data(), 0, 0); - return; - } - if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { - parallel_for( - 0, - tensor1.numel(), - 1, - [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { - apply_kernel( - end - begin, - begin, - op, - strided_tensor_iter_fixed(tensor1), - strided_tensor_iter_fixed(tensor2)); - }); - } else { - parallel_for( - 0, - tensor1.numel(), - 1, - [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { - apply_kernel( - end - begin, - begin, - op, - strided_tensor_iter(tensor1), - strided_tensor_iter(tensor2)); - }); - } -} - /* Apply a pointwise operator to sequence of tensors diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 29a1d6709131..c3a64623e9e0 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -123,7 +123,7 @@ TypeExtendedInterface& getType(TensorOptions options) { TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( - backend, typeMetaToScalarType(impl->dtype()), impl->is_variable() && !at::NonVariableTypeMode::is_enabled()); + backend, typeMetaToScalarType(impl->dtype()), impl->is_variable()); } TypeExtendedInterface& getType(const Tensor& t) { diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 8604ec563047..e75db146ed20 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -42,6 +42,12 @@ static DLDataType getDLDataType(const Tensor& t) { case ScalarType::QInt8: throw std::logic_error("QInt8 is not supported by dlpack"); break; + case ScalarType::QUInt8: + throw std::logic_error("QUInt8 is not supported by dlpack"); + break; + case ScalarType::QInt32: + throw std::logic_error("QInt32 is not supported by dlpack"); + break; case ScalarType::ComplexHalf: throw std::logic_error("ComplexHalf is not supported by dlpack"); case ScalarType::ComplexFloat: diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 0eea8e39909b..75f831c12b87 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -92,11 +92,13 @@ options: - arguments: - arg: THTensor* self + broadcast: mask inplace fallback types:Bool - THBoolTensor* mask - real value - zero_dim_tensor_only: True arguments: - arg: THTensor* self + broadcast: mask inplace fallback types:Bool - THBoolTensor* mask - THTensor* value ]] @@ -118,12 +120,15 @@ return: self arguments: - arg: THTensor* self + broadcast: mask inplace fallback types:Bool - THBoolTensor* mask - THTensor* source ]] [[ name: _th_masked_select cname: maskedSelect + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -137,6 +142,8 @@ [[ name: _th_masked_select_bool cname: maskedSelectBool + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -144,11 
+151,13 @@ - arg: THTensor* result output: True - arg: THTensor* self + broadcast: mask fallback types:Bool - THBoolTensor* mask ]] [[ name: _th_nonzero cname: nonzero + cpu_half: True cpu_bool: True cuda_bool: True variants: @@ -365,6 +374,8 @@ ]] [[ name: _th_and + cpu_bool: True + cuda_bool: True cname: __and__ variants: - function @@ -387,6 +398,8 @@ [[ name: _th_iand_ cname: __iand__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -406,6 +419,8 @@ [[ name: _th_or cname: __or__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -427,6 +442,8 @@ [[ name: _th_ior_ cname: __ior__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -446,6 +463,8 @@ [[ name: _th_xor cname: __xor__ + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -1771,6 +1790,8 @@ [[ name: _th_sign cname: sign + cpu_bool: True + cuda_bool: True variants: - function return: argument 0 @@ -2362,22 +2383,6 @@ if_false: N default: S ]] -[[ - name: _th_getri_single - cname: getri - types: - - Float - - Double - backends: - - CPU - - CUDA - variants: function - return: argument 0 - arguments: - - arg: THTensor* output - output: True - - THTensor* self -]] [[ name: _th_potri cname: potri @@ -2624,7 +2629,6 @@ - floating_point backends: - CPU - - CUDA cname: uniform variants: function return: self diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 649bc9d4cec1..8deac1ac9aee 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -11,6 +12,14 @@ return __VA_ARGS__(); \ } +#define AT_QINT_PRIVATE_CASE_TYPE(enum_type, type, underlying_enum, underlying_type, ...) \ + case enum_type: { \ + const auto& UNDERLYING_TYPE C10_UNUSED = underlying_enum; \ + using scalar_t C10_UNUSED = type; \ + using underlying_t C10_UNUSED = underlying_type; \ + return __VA_ARGS__(); \ + } + namespace detail { template @@ -59,6 +68,54 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } +// The AT_DISPATCH_* family of macros provides the ability to +// conveniently generate specializations of a kernel over all of the +// dtypes we care about in PyTorch. We call it "dispatch" because +// we are "dispatching" to the correct, dtype-specific kernel. +// +// A standard usage looks like: +// +// AT_DISPATCH_ALL_TYPES(self.scalar_type(), "op_name", [&] { +// // Your code here, with 'scalar_t' now defined to +// // be the dtype in question +// }) +// +// There are many variations of this macro, so it's important to +// understand exactly /which/ dtypes you want to get instantiated, as +// well as what the "default" set is. +// +// The default set of dtypes that are instantiated (e.g., by +// AT_DISPATCH_ALL_TYPES) are floating point types (float, double), +// and integral types (int32_t, int64_t, int16_t, int8_t, uint8_t), +// but NOT booleans (bool), half-precision floats (Half) or +// complex number (std::complex, std::complex). +// This "cut" is somewhat historical (the default types are the +// ones that TH historically supported), but it also reflects the +// fact that the non-default types are "poorly" behaved (booleans +// are NOT integers mod 2, half precision operations ~essentially +// don't exist on CPU, complex numbers are an experimental application). +// +// Here are the questions you should generally ask to decide which +// dispatch you want: +// +// 1. Is this an integral or floating point specific operation? 
+// (If so, you'll want one of the FLOATING or INTEGRAL macros.) +// +// 2. Should half be supported? (If you're on CPU, the answer is almost +// definitely no. If you do want support, use one of the AND_HALF +// macros) +// +// Much rarer situations: +// +// 3. Should bool be supported? (You often have to write your kernel +// differently if arithmetic operations are involved.) If so, +// Use AT_DISPATCH_ALL_TYPES_AND along with ScalarType::Bool +// +// 4. Should complex be supported? The answer is almost always no, +// unless you are working on "generic" code that should work on +// all dtypes. + + // NB: the the_type variable is not used, but we have kept it for // backwards compatibility. It's probably not used by anyone though; // but we're just being safe (and it doesn't hurt.) Note we must @@ -127,26 +184,6 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() -#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ - [&] { \ - detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF(); \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ - default: \ - AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ - } \ - }() - #define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -180,6 +217,21 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& SCALAR_TYPE C10_UNUSED = TYPE; \ + switch (TYPE) { \ + AT_QINT_PRIVATE_CASE_TYPE( \ + kQInt8, qint8, kChar, int8_t, __VA_ARGS__) \ + AT_QINT_PRIVATE_CASE_TYPE( \ + kQUInt8, quint8, kByte, uint8_t, __VA_ARGS__) \ + AT_QINT_PRIVATE_CASE_TYPE( \ + kQInt32, qint32, kInt, int, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } \ + }() + #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -202,30 +254,6 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() -#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) 
\ - [&] { \ - detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ - default: \ - AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ - } \ - }() - #define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ [&] { \ switch (TYPE) { \ @@ -279,3 +307,51 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} AT_ERROR(#NAME, " not implemented for '", TYPE, "'"); \ } \ }() + +// ---------------------------------------------------------------------------- +// DEPRECATED MACROS, DON'T USE THESE +// ---------------------------------------------------------------------------- + +#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF(); \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) 
\ + [&] { \ + detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index f0e854cf4dbb..54fcc7721f71 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -16,7 +16,7 @@ std::vector infer_size(IntArrayRef a, IntArrayRef b) { int64_t sizeA = (dimA >= 0) ? a[dimA] : 1; int64_t sizeB = (dimB >= 0) ? b[dimB] : 1; - AT_CHECK( + TORCH_CHECK( sizeA == sizeB || sizeA == 1 || sizeB == 1, "The size of tensor a (", sizeA, ") must match the size of tensor b (", sizeB, @@ -53,7 +53,7 @@ std::tuple, std::vector> inferExpandGeometry( : expandedSizes[i + 1] * expandedStrides[i + 1]; int64_t targetSize = sizes[i]; if (targetSize == -1) { - AT_CHECK( + TORCH_CHECK( dim >= 0, "The expanded size of the tensor (", targetSize, @@ -62,7 +62,7 @@ std::tuple, std::vector> inferExpandGeometry( targetSize = size; } if (size != targetSize) { - AT_CHECK( + TORCH_CHECK( size == 1, "The expanded size of the tensor (", targetSize, diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index d001bbd17152..aa6ac328f6a3 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -36,7 +36,7 @@ static std::vector infer_size(IntArrayRef shape, int64_t numel) { // works yet // empty_tensor.view(-1, 0) // doesn't. - AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", + TORCH_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape, " because the unspecified dimension size -1 can be any " "value and is ambiguous"); res[*infer_dim] = numel / newsize; diff --git a/aten/src/ATen/MatrixRef.h b/aten/src/ATen/MatrixRef.h index 869142ff8562..d6bcc08addd2 100644 --- a/aten/src/ATen/MatrixRef.h +++ b/aten/src/ATen/MatrixRef.h @@ -40,7 +40,7 @@ namespace at { /// Construct an MatrixRef from an ArrayRef and outer stride. 
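Stepping back to the dispatch documentation added in `Dispatch.h` above: a minimal caller-side sketch of those macros is shown below. It is illustrative only and not part of this patch; the function name `add_one_` and the contiguity assumption are made up for the example.

```cpp
// Illustrative only (not from this patch): a simple CPU kernel dispatched over
// the default dtype set with AT_DISPATCH_ALL_TYPES. Assumes a contiguous
// tensor for simplicity; add_one_ is a hypothetical function name.
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

void add_one_(at::Tensor& self) {
  AT_DISPATCH_ALL_TYPES(self.scalar_type(), "add_one_", [&] {
    // Inside the lambda, scalar_t is bound to the C++ type that matches
    // self.scalar_type() (float, double, int64_t, ...).
    scalar_t* data = self.data<scalar_t>();
    for (int64_t i = 0; i < self.numel(); ++i) {
      data[i] = data[i] + scalar_t(1);
    }
  });
}
```

Per the new comment block, switching to `AT_DISPATCH_FLOATING_TYPES` or one of the `_AND` variants only changes which `ScalarType` cases get instantiated; the lambda body stays the same.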
/*implicit*/ MatrixRef(ArrayRef arr, size_type stride0) : arr(arr), stride0(stride0) { - AT_CHECK(arr.size() % stride0 == 0, "MatrixRef: ArrayRef size ", arr.size(), " not divisible by stride ", stride0) + TORCH_CHECK(arr.size() % stride0 == 0, "MatrixRef: ArrayRef size ", arr.size(), " not divisible by stride ", stride0) } /// @} @@ -59,7 +59,7 @@ namespace at { } else if (dim == 1) { return stride0; } else { - AT_CHECK(0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1"); + TORCH_CHECK(0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1"); } } diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index 12d8ea254b5c..92a206d002a1 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { @@ -36,7 +37,7 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { AT_ERROR("opaque tensors do not have strides"); } - bool is_contiguous() const override { + bool is_contiguous(c10::MemoryFormat memory_format=c10::MemoryFormat::Any) const override { AT_ERROR("opaque tensors do not have is_contiguous"); } @@ -78,15 +79,15 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl { // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields: // 1. the AutogradMeta pointer, because it is unique for each Variable. -// 2. the version counter, because although it lives in TensorImpl, the version counter is managed -// by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what -// the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details. +// 2. the version counter, because it is set to the passed in `version_counter`. +// See NOTE [ Version Counter Sharing ] for details. // -// NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites -// to this function that need to change the shallow copy's size or storage afterwards, and setting -// `allow_tensor_metadata_change_` to false would prevent those changes from happening and is -// undesirable. -c10::intrusive_ptr shallow_copy_and_detach() const override { +// NOTE: `allow_tensor_metadata_change` determines whether the TensorImpl shallow-copy +// allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). +// See NOTE [ Metadata Change for a Detached Tensor ] for details. +c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { //AT_ASSERT(false); auto impl = c10::make_intrusive>( type_id(), dtype(), device(), opaque_handle_, sizes_); @@ -99,6 +100,8 @@ c10::intrusive_ptr shallow_copy_and_detach() const override { impl->is_contiguous_ = is_contiguous_; impl->is_wrapped_number_ = is_wrapped_number_; impl->reserved_ = reserved_; + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); // OpaqueTensorImpl-specific fields (none currently). 
return impl; diff --git a/aten/src/ATen/Parallel.cpp b/aten/src/ATen/Parallel.cpp index 0a965f7dba06..0d81dd443699 100644 --- a/aten/src/ATen/Parallel.cpp +++ b/aten/src/ATen/Parallel.cpp @@ -5,6 +5,7 @@ #include #include +#include #ifdef TH_BLAS_MKL #include @@ -13,8 +14,43 @@ namespace at { namespace { +const int NOT_SET = -1; +const int CONSUMED = -2; + // Number of threads set by the user -std::atomic num_threads(-1); +std::atomic num_threads{NOT_SET}; + +// Number of inter-op threads set by the user; +// NOT_SET -> positive value -> CONSUMED +// (CONSUMED - thread pool is initialized) +// or +// NOT_SET -> CONSUMED +std::atomic num_interop_threads{NOT_SET}; + +// thread pool global instance is hidden, +// users should use at::launch and get/set_num_interop_threads interface +TaskThreadPoolBase& get_pool() { + static std::shared_ptr pool = + ThreadPoolRegistry()->Create( + "C10", + /* device_id */ 0, + /* pool_size */ num_interop_threads.exchange(CONSUMED), + /* create_new */ true); + return *pool; +} + + // Factory function for ThreadPoolRegistry +std::shared_ptr create_c10_threadpool( + int device_id, + int pool_size, + bool create_new) { + // For now, the only accepted device id is 0 + AT_CHECK(device_id == 0); + // Create new thread pool + AT_CHECK(create_new); + return std::make_shared(pool_size); +} + } void init_num_threads() { @@ -32,10 +68,9 @@ void init_num_threads() { } } -void set_num_threads(size_t nthreads) { - if (nthreads == 0) { - return; - } +void set_num_threads(int nthreads) { + AT_CHECK(nthreads > 0, "Expected positive number of threads"); + num_threads.store(nthreads); #ifdef _OPENMP omp_set_num_threads(nthreads); @@ -56,7 +91,7 @@ void set_num_threads(size_t nthreads) { // region might be different in the new thread; // Use init_num_threads() during thread initialization to ensure // consistent size of parallel region in different threads -size_t get_num_threads() { +int get_num_threads() { #ifdef _OPENMP return omp_get_max_threads(); #else @@ -100,7 +135,7 @@ std::string get_parallel_info() { } PTThreadPool::PTThreadPool( - std::size_t pool_size, + int pool_size, int numa_node_id) : c10::ThreadPool(pool_size, numa_node_id) {} @@ -109,26 +144,31 @@ void PTThreadPool::init_thread() { at::init_num_threads(); } -namespace { +C10_REGISTER_CREATOR(ThreadPoolRegistry, C10, create_c10_threadpool); -std::shared_ptr createC10ThreadPool( - int device_id, - int pool_size, - bool create_new) { - static std::shared_ptr pool = - std::make_shared(pool_size); - // For now, the only accepted device id is 0 - // for the JIT inter-op pool (CPU), - AT_ASSERT(device_id == 0); - // we use the shared thread pool - AT_ASSERT(!create_new); - // and the size does not change - AT_ASSERT(pool->size() == pool_size); - return pool; +void set_num_interop_threads(int nthreads) { + AT_CHECK(nthreads > 0, "Expected positive number of threads"); + + int no_value = NOT_SET; + AT_CHECK(num_interop_threads.compare_exchange_strong(no_value, nthreads), + "Error: cannot set number of interop threads after parallel work " + "has started or set_num_interop_threads called"); } -} // namespace +int get_num_interop_threads() { + int nthreads = num_interop_threads.load(); + if (nthreads > 0) { + return nthreads; + } else if (nthreads == NOT_SET) { + // return default value + return TaskThreadPoolBase::defaultNumThreads(); + } else { + return get_pool().size(); + } +} -C10_REGISTER_CREATOR(ThreadPoolRegistry, C10, createC10ThreadPool); +void launch(const std::function& func) { + get_pool().run(func); +} } // 
namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 2668619436c2..fe7530793589 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -30,10 +30,10 @@ inline int64_t divup(int64_t x, int64_t y) { CAFFE2_API void init_num_threads(); // Sets the number of threads to be used in parallel region -CAFFE2_API void set_num_threads(size_t); +CAFFE2_API void set_num_threads(int); // Returns the number of threads used in parallel region -CAFFE2_API size_t get_num_threads(); +CAFFE2_API int get_num_threads(); // Returns the current thread number (starting from 0) // in the current parallel region, or 0 in the sequential region @@ -153,10 +153,19 @@ CAFFE2_API std::string get_parallel_info(); class CAFFE2_API PTThreadPool : public c10::ThreadPool { public: explicit PTThreadPool( - std::size_t pool_size, + int pool_size, int numa_node_id = -1); void init_thread() override; }; +// Sets number of threads used for inter-op parallelism +CAFFE2_API void set_num_interop_threads(int); + +// Returns the number of threads used for inter-op parallelism +CAFFE2_API int get_num_interop_threads(); + +// Launches inter-op parallel task +CAFFE2_API void launch(const std::function& func); + } // namespace at diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index d87b29707fa7..5f61313b98fc 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -51,7 +51,7 @@ SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeM IntArrayRef SparseTensorImpl::strides() const { AT_ERROR("sparse tensors do not have strides"); } -bool SparseTensorImpl::is_contiguous() const { +bool SparseTensorImpl::is_contiguous(at::MemoryFormat memory_format) const { AT_ERROR("sparse tensors do not have is_contiguous"); } int64_t SparseTensorImpl::stride(int64_t d) const { @@ -74,7 +74,7 @@ int64_t SparseTensorImpl::dim() const { return sparse_dim_ + dense_dim_; } TensorImpl* SparseTensorImpl::maybe_zero_dim(bool condition_when_zero_dim) { - AT_CHECK(condition_when_zero_dim == (dim() == 0), + TORCH_CHECK(condition_when_zero_dim == (dim() == 0), "Attempted to maybe_zero_dim on a SparseTensorImpl to ", condition_when_zero_dim, " but the SparseTensor's dim() is ", dim(), " and SparseTensors do not support" " changing dimensionality via maybe_zero_dim"); @@ -90,29 +90,29 @@ int64_t SparseTensorImpl::storage_offset() const { AT_ERROR("sparse tensors do not have storage"); } void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, const Tensor& values) { - AT_CHECK(allow_tensor_metadata_change(), "set_indices_and_values_unsafe is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_indices_and_values_unsafe is not allowed on Tensor created from .data or .detach()"); AT_ASSERT(!indices.is_variable() && !values.is_variable()); // They should be plain tensors! 
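Before moving on: the `Parallel.h`/`Parallel.cpp` hunks above introduce a small public API for inter-op parallelism (`set_num_interop_threads`, `get_num_interop_threads`, `at::launch`). A minimal usage sketch, not part of the patch and with an arbitrary payload, might look like this:

```cpp
// Illustrative only: configuring and using the inter-op thread pool added in
// Parallel.h/Parallel.cpp above. The payload and function names are made up.
#include <ATen/Parallel.h>
#include <atomic>

std::atomic<int> tasks_done{0};

void init_interop() {
  // Per set_num_interop_threads(), this may be called at most once and only
  // before any inter-op work has started; the value is consumed when the
  // shared pool is first created.
  at::set_num_interop_threads(4);
}

void schedule_background_task() {
  at::launch([] {
    // Runs asynchronously on the shared inter-op thread pool.
    tasks_done.fetch_add(1);
  });
}
```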
// TODO: change this to check `.requires_grad()` and `GradMode::is_enabled()` when Variable and Tensor are merged - AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); - AT_CHECK(!values.is_sparse(), "expected values to be a dense tensor, but got values of layout ", values.layout()); + TORCH_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + TORCH_CHECK(!values.is_sparse(), "expected values to be a dense tensor, but got values of layout ", values.layout()); - AT_CHECK(values.device().type() == device().type(), "device type of values (", values.device().type(), ") must match device type of device().type()", device().type(), ")"); - AT_CHECK(values.scalar_type() == typeMetaToScalarType(dtype()), "dtype of values (", values.scalar_type(), ") must match dtype of sparse tensor (", typeMetaToScalarType(dtype()), ")"); - AT_CHECK(indices.scalar_type() == kLong, "indices must be an int64 tensor"); - AT_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); - AT_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); + TORCH_CHECK(values.device().type() == device().type(), "device type of values (", values.device().type(), ") must match device type of device().type()", device().type(), ")"); + TORCH_CHECK(values.scalar_type() == typeMetaToScalarType(dtype()), "dtype of values (", values.scalar_type(), ") must match dtype of sparse tensor (", typeMetaToScalarType(dtype()), ")"); + TORCH_CHECK(indices.scalar_type() == kLong, "indices must be an int64 tensor"); + TORCH_CHECK(indices.type().backend() == values.type().backend(), "backend of indices (", indices.type().backend(), ") must match backend of values (", values.type().backend(), ")"); + TORCH_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")"); - AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()); - AT_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.size(1), ", nnz from values: ", values.size(0)); - AT_CHECK(indices.size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.size(0)); - AT_CHECK(values.dim() == dense_dim_ + 1, "values has incorrect number of dimensions, expected ", dense_dim_ + 1, ", got ", values.dim()); + TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()); + TORCH_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.size(1), ", nnz from values: ", values.size(0)); + TORCH_CHECK(indices.size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.size(0)); + TORCH_CHECK(values.dim() == dense_dim_ + 1, "values has incorrect number of dimensions, expected ", dense_dim_ + 1, ", got ", values.dim()); auto dense_size_original = sizes().slice(sparse_dim_); std::vector expected_values_size_vec = {values.size(0)}; expected_values_size_vec.insert(expected_values_size_vec.end(), dense_size_original.begin(), 
dense_size_original.end()); IntArrayRef expected_values_size(expected_values_size_vec); auto new_values_size = values.sizes(); - AT_CHECK( + TORCH_CHECK( std::equal(expected_values_size.begin(), expected_values_size.end(), new_values_size.begin()), "values has incorrect size, expected ", expected_values_size, ", got ", new_values_size ); diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 29cdda99eef2..e611b3b86ee0 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -41,7 +41,7 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { Tensor values() const { return values_; } IntArrayRef strides() const override; - bool is_contiguous() const override; + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const override; int64_t stride(int64_t d) const override; void resize_dim(int64_t ndim) override; void set_size(int64_t dim, int64_t new_size) override; @@ -57,7 +57,7 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim with // respect to indices and values void raw_resize_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { - AT_CHECK(allow_tensor_metadata_change(), "raw_resize_ is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "raw_resize_ is not allowed on Tensor created from .data or .detach()"); sizes_ = size.vec(); sparse_dim_ = sparse_dim; dense_dim_ = dense_dim; @@ -87,8 +87,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // 4. When we attempt to shrink the size of any of the sparse dimensions on a non-empty sparse tensor // (this could make some of the stored indices out-of-bound and thus unsafe). void resize_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { - AT_CHECK(allow_tensor_metadata_change(), "resize_ is not allowed on Tensor created from .data or .detach()"); - AT_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); + TORCH_CHECK(allow_tensor_metadata_change(), "resize_ is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); if (nnz() > 0) { auto alt_options_msg = "You could try the following options:\n\ 1. If you need an empty sparse tensor of this size, call `x = torch.sparse_coo_tensor(size)`.\n\ @@ -96,10 +96,10 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { 1. For both sparse and dense dimensions, keep the number of them constant and the size of them non-shrinking, and then try the same call again.\n\ 2. 
Or, create a new sparse tensor with the correct indices and values from this sparse tensor."; - AT_CHECK(sparse_dim == sparse_dim_, + TORCH_CHECK(sparse_dim == sparse_dim_, "changing the number of sparse dimensions (from ", sparse_dim_, " to ", sparse_dim, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); - AT_CHECK(dense_dim == dense_dim_, + TORCH_CHECK(dense_dim == dense_dim_, "changing the number of dense dimensions (from ", dense_dim_, " to ", dense_dim, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); bool shrinking_sparse_dims = false; @@ -121,10 +121,10 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { } } - AT_CHECK(!shrinking_sparse_dims, + TORCH_CHECK(!shrinking_sparse_dims, "shrinking the size of sparse dimensions (from ", sparse_size_original, " to ", sparse_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); - AT_CHECK(!shrinking_dense_dim, + TORCH_CHECK(!shrinking_dense_dim, "shrinking the size of dense dimensions (from ", dense_size_original, " to ", dense_size_new, ") on a non-empty sparse tensor is not supported.\n", alt_options_msg); } @@ -145,8 +145,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // NOTE: this function will resize the sparse tensor and also set `indices` and `values` to empty. void resize_and_clear_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { - AT_CHECK(allow_tensor_metadata_change(), "resize_and_clear_ is not allowed on Tensor created from .data or .detach()"); - AT_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); + TORCH_CHECK(allow_tensor_metadata_change(), "resize_and_clear_ is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(sparse_dim + dense_dim == static_cast(size.size()), "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); sizes_ = size.vec(); sparse_dim_ = sparse_dim; @@ -162,13 +162,13 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { } void set_coalesced(bool coalesced) { - AT_CHECK(allow_tensor_metadata_change(), "set_coalesced is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_coalesced is not allowed on Tensor created from .data or .detach()"); coalesced_ = coalesced; } // NOTE: this function is only used internally and not exposed to Python frontend void set_nnz_and_narrow(int64_t new_nnz) { - AT_CHECK(allow_tensor_metadata_change(), "set_nnz_and_narrow is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_nnz_and_narrow is not allowed on Tensor created from .data or .detach()"); AT_ASSERT(new_nnz <= nnz()); indices_ = indices_.narrow(1, 0, new_nnz); values_ = values_.narrow(0, 0, new_nnz); @@ -185,15 +185,15 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields: // 1. the AutogradMeta pointer, because it is unique for each Variable. - // 2. the version counter, because although it lives in TensorImpl, the version counter is managed - // by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what - // the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details. + // 2. 
the version counter, because it is set to the passed in `version_counter`. + // See NOTE [ Version Counter Sharing ] for details. // - // NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites - // to this function that need to change the shallow copy's size or storage afterwards, and setting - // `allow_tensor_metadata_change_` to false would prevent those changes from happening and is - // undesirable. - c10::intrusive_ptr shallow_copy_and_detach() const override { + // NOTE: `allow_tensor_metadata_change` determines whether the TensorImpl shallow-copy + // allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + // See NOTE [ Metadata Change for a Detached Tensor ] for details. + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { auto impl = c10::make_intrusive(type_id(), dtype()); // TensorImpl general fields // Note that these fields are not used in sparse tensor code, and we copy them here only for completeness. @@ -203,6 +203,8 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl { impl->is_contiguous_ = is_contiguous_; impl->is_wrapped_number_ = is_wrapped_number_; impl->reserved_ = reserved_; + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); // Sparse-specific fields impl->sparse_dim_ = sparse_dim(); diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 3928494cae3b..1e4391961e1d 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -46,8 +46,8 @@ struct CAFFE2_API TensorGeometry { TensorGeometry transpose(int64_t dim0, int64_t dim1) { TensorGeometry r = *this; // copy - AT_CHECK(dim0 < dim(), "transpose: dim0=", dim0, " out of range (dim=", dim(), ")") - AT_CHECK(dim1 < dim(), "transpose: dim1=", dim1, " out of range (dim=", dim(), ")") + TORCH_CHECK(dim0 < dim(), "transpose: dim0=", dim0, " out of range (dim=", dim(), ")") + TORCH_CHECK(dim1 < dim(), "transpose: dim1=", dim1, " out of range (dim=", dim(), ")") std::swap(r.sizes_[dim0], r.sizes_[dim1]); std::swap(r.strides_[dim0], r.strides_[dim1]); return r; diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 745870423aa7..742089f2c7e2 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -20,13 +20,13 @@ std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) { } void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim) { - AT_CHECK(t->dim() == dim, + TORCH_CHECK(t->dim() == dim, "Expected ", dim, "-dimensional tensor, but got ", t->dim(), "-dimensional tensor for ", t," (while checking arguments for ", c, ")"); } void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end) { - AT_CHECK( + TORCH_CHECK( t->dim() >= dim_start && t->dim() < dim_end, "Expected ", dim_start, " to ", (dim_end - 1), " dimensions, but got ", t->dim(), "-dimensional tensor for ", t, " (while checking arguments for ", @@ -34,7 +34,7 @@ void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, } void checkContiguous(CheckedFrom c, const TensorGeometryArg& t) { - AT_CHECK( + TORCH_CHECK( t->is_contiguous(), "Expected contiguous tensor, but got non-contiguous tensor for ", t, " (while checking arguments for ", c, ")"); @@ -49,14 +49,14 @@ void checkAllContiguous(CheckedFrom c, at::ArrayRef ts) { void checkSize(CheckedFrom c, const 
TensorGeometryArg& t, IntArrayRef sizes) { checkDim(c, t, sizes.size()); - AT_CHECK( + TORCH_CHECK( t->sizes().equals(sizes), "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), " for ", t, " (while checking arguments for ", c, ")"); } void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size) { - AT_CHECK( + TORCH_CHECK( t->size(dim) == size, "Expected tensor to have size ", size, " at dimension ", dim, ", but got size ", t->size(dim), " for ", t, @@ -76,7 +76,7 @@ void checkAllSame(CheckedFrom c, ArrayRef tensors, void(*fn)(CheckedF } void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->sizes().equals(t2->sizes()), "Expected tensor for ", t1, " to have same size as tensor for ", t2, "; but ", t1->sizes(), " does not equal ", t2->sizes(), @@ -88,7 +88,7 @@ void checkAllSameSize(CheckedFrom c, ArrayRef tensors) { } void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel) { - AT_CHECK( + TORCH_CHECK( t->numel() == numel, "Expected tensor for ", t, " to have ", numel, " elements; but it actually has ", t->numel(), " elements", @@ -96,7 +96,7 @@ void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel) { } void checkSameNumel(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->numel() == t2->numel(), "Expected tensor for ", t1, " to have same number of elements as tensor for ", t2, "; but ", @@ -121,7 +121,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { << " to be on GPU (while checking arguments for " << c << ")"; AT_ERROR(oss.str()); } - AT_CHECK( + TORCH_CHECK( t1->get_device() == t2->get_device(), "Expected tensor for ", t1, " to have the same device as tensor for ", t2, "; but device ", t1->get_device(), " does not equal ", t2->get_device(), @@ -133,7 +133,7 @@ void checkAllSameGPU(CheckedFrom c, ArrayRef tensors) { } void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->type() == t2->type(), "Expected tensor for ", t1, " to have the same type as tensor for ", t2, "; but type ", t1->toString(), " does not equal ", t2->toString(), @@ -141,7 +141,7 @@ void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { } void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType ty) { - AT_CHECK( + TORCH_CHECK( t->scalar_type() == ty, "Expected tensor for ", t, " to have scalar type ", toString(ty), "; but got ", t->toString(), " instead (while checking arguments for ", c, @@ -173,7 +173,7 @@ void checkAllSameType(CheckedFrom c, ArrayRef tensors) { } void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2) { - AT_CHECK( + TORCH_CHECK( t1->dim() == t2->dim(), "Expected tensor for ", t1, " to have the same dimension as tensor for ", t2, "; but ", t1->dim(), " does not equal ", t2->dim(), @@ -181,7 +181,7 @@ void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeomet } void checkDefined(CheckedFrom c, const TensorArg& t) { - AT_CHECK( + TORCH_CHECK( t->defined(), "Expected tensor for ", t, " to be non-null, but it was undefined ", " (while checking arguments for ", c, ")"); @@ -195,7 +195,7 @@ void checkAllDefined(CheckedFrom c, ArrayRef ts) { } void checkBackend(CheckedFrom c, const Tensor& t, Backend backend) { - AT_CHECK( + TORCH_CHECK( !t.defined() || t.type().backend() == backend, "Expected tensor to have ", toString(backend), " Backend, but got tensor 
with ", toString(t.type().backend()), " Backend ", @@ -209,7 +209,7 @@ void checkBackend(CheckedFrom c, at::ArrayRef tensors, at::Backend backe } void checkDeviceType(CheckedFrom c, const Tensor& t, DeviceType device_type) { - AT_CHECK( + TORCH_CHECK( !t.defined() || t.type().device_type() == device_type, "Expected tensor to have ", device_type, " DeviceType, but got tensor with ", t.type().device_type(), " DeviceType ", @@ -223,7 +223,7 @@ void checkDeviceType(CheckedFrom c, at::ArrayRef tensors, at::DeviceType } void checkLayout(CheckedFrom c, const Tensor& t, Layout layout) { - AT_CHECK( + TORCH_CHECK( !t.defined() || t.layout() == layout, "Expected tensor to have ", layout, " Layout, but got tensor with ", t.layout(), " Layout ", @@ -263,6 +263,29 @@ bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { return contig_if_nonempty; } +// Correspond to THCUNN_check_dim_size/THNN_check_dim_size +void check_dim_size( + const Tensor& tensor, + int64_t dim, + int64_t dim_size, + int64_t size) { + /* Check dimension size of a tensor */ + TORCH_CHECK( + tensor.dim() == dim && tensor.size(dim_size) == size, + "Expected a tensor of dimension ", + dim, + " and tensor.size[", + dim_size, + "] == ", + size, + " but got: dimension ", + tensor.dim(), + " and tensor.size[", + dim_size, + "] = ", + tensor.size(dim_size)); +} + namespace detail { std::vector defaultStrides(IntArrayRef sizes) { @@ -287,5 +310,6 @@ int64_t computeStorageSize(IntArrayRef sizes, IntArrayRef strides) { } return size; } + } // namespace detail } // namespace at diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 7ddd689376c6..3c8998c88f80 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -135,6 +135,13 @@ CAFFE2_API void* maybe_data_ptr(const TensorArg& tensor); // on whether a subgeometry is contiguous. 
CAFFE2_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); +// Correspond to THCUNN_check_dim_size/THNN_check_dim_size +CAFFE2_API void check_dim_size( + const Tensor& tensor, + int64_t dim, + int64_t dim_size, + int64_t size); + namespace detail { CAFFE2_API std::vector defaultStrides(IntArrayRef sizes); CAFFE2_API int64_t computeStorageSize(IntArrayRef sizes, IntArrayRef strides); diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index eefc0c80be77..a2af1b0dcd71 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -13,11 +13,11 @@ namespace at { constexpr size_t dim_bitset_size = 64; static inline std::bitset dim_list_to_bitset(IntArrayRef dims, int64_t ndims) { - AT_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); + TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); std::bitset seen; for (size_t i = 0; i < dims.size(); i++) { size_t dim = maybe_wrap_dim(dims[i], ndims); - AT_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); + TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); seen[dim] = true; } return seen; diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt index 0f5d86f2db1d..dd1f3f9058d5 100644 --- a/aten/src/ATen/core/CMakeLists.txt +++ b/aten/src/ATen/core/CMakeLists.txt @@ -9,7 +9,6 @@ EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) # Add files needed from jit folders LIST(APPEND ATen_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/source_range.h - ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/source_location.h ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/function_schema_parser.h ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/lexer.h ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/strtod.h @@ -23,6 +22,7 @@ LIST(APPEND ATen_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/lexer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/strtod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/script/schema_type_parser.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../torch/csrc/jit/source_range.cpp ) # Pass to parent diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h index ed09ccb8af31..50645b62e5d4 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.h +++ b/aten/src/ATen/core/DeprecatedTypeProperties.h @@ -64,9 +64,16 @@ class CAFFE2_API DeprecatedTypeProperties { } std::string toString() const { - std::stringstream ss; - ss << at::toString(backend()) << at::toString(scalarType()) << "Type"; - return ss.str(); + std::string base_str; + if (backend_ == Backend::Undefined || scalar_type_ == ScalarType::Undefined) { + base_str = "UndefinedType"; + } else { + base_str = std::string(at::toString(backend_)) + at::toString(scalar_type_) + "Type"; + } + if (is_variable_) { + return "Variable[" + base_str + "]"; + } + return base_str; } DeprecatedTypeProperties & toBackend(Backend b) const { diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index 43e7a5736072..2cd374677108 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -2,46 +2,19 @@ #include #include -#include #include namespace c10 { +struct IValue; template class Dict; namespace impl { - -inline 
bool shallowEquals(const IValue& lhs, const IValue& rhs) { - if (lhs.isNone()) { - return rhs.isNone(); - } else if (lhs.isInt()) { - return rhs.isInt() && lhs.toInt() == rhs.toInt(); - } else if (lhs.isString()) { - return rhs.isString() && lhs.toStringRef() == rhs.toStringRef(); - } else if (lhs.isDouble()) { - return rhs.isDouble() && lhs.toDouble() == rhs.toDouble(); - } else if (lhs.isBool()) { - return rhs.isBool() && lhs.toBool() == rhs.toBool(); - } else { - AT_ERROR("shallowEquals(IValue, IValue) not implemented for type ", lhs.tagKind()); - } -} +bool shallowEquals(const IValue& lhs, const IValue& rhs); } namespace detail { struct DictHash { - size_t operator()(const IValue& ivalue) const { - if (ivalue.isInt()) { - return std::hash()(ivalue.toInt()); - } else if (ivalue.isString()) { - return std::hash()(ivalue.toStringRef()); - } else if (ivalue.isDouble()) { - return std::hash()(ivalue.toDouble()); - } else if (ivalue.isBool()) { - return std::hash()(ivalue.toBool()); - } else { - throw std::runtime_error("Can't hash IValues with this tag"); - } - } + size_t operator()(const IValue& ivalue) const; }; struct DictEqualTo { @@ -208,71 +181,53 @@ class Dict final { * Returns an iterator to the first element of the container. * If the container is empty, the returned iterator will be equal to end(). */ - iterator begin() { - return iterator{map_.begin()}; - } + iterator begin(); /** * Returns an iterator to the first element of the container. * If the container is empty, the returned iterator will be equal to end(). */ - const_iterator begin() const { - return const_iterator{map_.begin()}; - } + const_iterator begin() const; /** * Returns an iterator to the first element of the container. * If the container is empty, the returned iterator will be equal to end(). */ - const_iterator cbegin() const { - return const_iterator{map_.cbegin()}; - } + const_iterator cbegin() const; /** * Returns an iterator to the element following the last element of the container. * This element acts as a placeholder; attempting to access it results in undefined behavior. */ - iterator end() { - return iterator{map_.end()}; - } + iterator end(); /** * Returns an iterator to the element following the last element of the container. * This element acts as a placeholder; attempting to access it results in undefined behavior. */ - const_iterator end() const { - return const_iterator{map_.end()}; - } + const_iterator end() const; /** * Returns an iterator to the element following the last element of the container. * This element acts as a placeholder; attempting to access it results in undefined behavior. */ - const_iterator cend() const { - return const_iterator{map_.cend()}; - } + const_iterator cend() const; /** * Checks if the container has no elements. */ - bool empty() const { - return map_.empty(); - } + bool empty() const; /** * Returns the number of elements in the container. */ - size_type size() const { - return map_.size(); - } + size_type size() const; /** * Erases all elements from the container. After this call, size() returns zero. * Invalidates any references, pointers, or iterators referring to contained elements. May also invalidate past-the-end iterators. */ - void clear() { - map_.clear(); - } + void clear(); /** * Inserts element(s) into the container, if the container doesn't already contain an element with an equivalent key. 
@@ -281,14 +236,7 @@ class Dict final { * @return A pair consisting of an iterator to the inserted element (or to the element that prevented the insertion) and a bool denoting whether the insertion took place. */ template - std::pair insert(Key_&& key, Value_&& value) { - static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); - static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); - auto inserted = map_.insert({ - Key(std::forward(key)), - Value(std::forward(value))}); - return {iterator{inserted.first}, inserted.second}; - } + std::pair insert(Key_&& key, Value_&& value); /** * If an element with the given key already exists, it is overwritten with the given value. @@ -298,23 +246,14 @@ class Dict final { * @return The bool component is true if the insertion took place and false if the assignment took place. The iterator component is pointing at the element that was inserted or updated. */ template - std::pair insert_or_assign(Key_&& key, Value_&& value) { - static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert_or_assign"); - static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert_or_assign"); - auto inserted = map_.insert_or_assign( - Key(std::forward(key)), - Value(std::forward(value))); - return {iterator{inserted.first}, inserted.second}; - } + std::pair insert_or_assign(Key_&& key, Value_&& value); /** * Removes the element pointed to by iter. * May invalidate any references, pointers, or iterators referring to contained elements. * The iterator iter must be valid and dereferenceable. Thus the end() iterator (which is valid, but is not dereferenceable) cannot be used as a value for iter. */ - void erase(const_iterator iter) { - map_.erase(iter.entryRef_.iterator_); - } + void erase(const_iterator iter); /** * Removes the element with the given key, if it exists. @@ -322,17 +261,13 @@ class Dict final { * * @return The number of elements removed. This is either '1' if an element with the key existed, or '0' if it didn't. */ - C10_NODISCARD size_t erase(const Key& key) { - return map_.erase(key); - } + C10_NODISCARD size_t erase(const Key& key); /** * Returns the mapped value of the element with key equivalent to key. * If no such element exists, an exception of type std::out_of_range is thrown. */ - Value at(const Key& key) { - return map_.at(key).template to(); - } + Value at(const Key& key) const; /** * Finds an element with key equivalent to key. @@ -340,9 +275,7 @@ class Dict final { * @return Iterator to an element with key equivalent to key. * If no such element is found, past-the-end (see end()) iterator is returned. */ - iterator find(const Key& key) { - return iterator{map_.find(key)}; - } + iterator find(const Key& key); /** * Finds an element with key equivalent to key. @@ -350,26 +283,20 @@ class Dict final { * @return Iterator to an element with key equivalent to key. * If no such element is found, past-the-end (see end()) iterator is returned. */ - const_iterator find(const Key& key) const { - return const_iterator{map_.find(key)}; - } + const_iterator find(const Key& key) const; /** * Checks if there is an element with key equivalent to key in the container. * * @return true if there is such an element, otherwise false. 
*/ - bool contains(const Key& key) const { - return end() != find(key); - } + bool contains(const Key& key) const; /** * Increase the capacity so that at least count elements can be stored without * having to reallocate or rehash. */ - void reserve(size_type count) { - map_.reserve(count); - } + void reserve(size_type count); }; namespace impl { @@ -391,4 +318,4 @@ GenericDict toGenericDict(Dict&& dict) { } -#include +#include diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h new file mode 100644 index 000000000000..0118bee18c41 --- /dev/null +++ b/aten/src/ATen/core/Dict_inl.h @@ -0,0 +1,144 @@ +#pragma once + +#include + +namespace c10 { +namespace impl { +inline bool shallowEquals(const IValue& lhs, const IValue& rhs) { + if (lhs.isNone()) { + return rhs.isNone(); + } else if (lhs.isInt()) { + return rhs.isInt() && lhs.toInt() == rhs.toInt(); + } else if (lhs.isString()) { + return rhs.isString() && lhs.toStringRef() == rhs.toStringRef(); + } else if (lhs.isDouble()) { + return rhs.isDouble() && lhs.toDouble() == rhs.toDouble(); + } else if (lhs.isBool()) { + return rhs.isBool() && lhs.toBool() == rhs.toBool(); + } else { + AT_ERROR("shallowEquals(IValue, IValue) not implemented for type ", lhs.tagKind()); + } +} +} + +namespace detail { + +inline size_t DictHash::operator()(const IValue& ivalue) const { + if (ivalue.isInt()) { + return std::hash()(ivalue.toInt()); + } else if (ivalue.isString()) { + return std::hash()(ivalue.toStringRef()); + } else if (ivalue.isDouble()) { + return std::hash()(ivalue.toDouble()); + } else if (ivalue.isBool()) { + return std::hash()(ivalue.toBool()); + } else { + throw std::runtime_error("Can't hash IValues with this tag"); + } +} + +} + +template +typename Dict::iterator Dict::begin() { + return iterator{map_.begin()}; +} + +template +typename Dict::const_iterator Dict::begin() const { + return const_iterator{map_.begin()}; +} + +template +typename Dict::const_iterator Dict::cbegin() const { + return const_iterator{map_.cbegin()}; +} + +template +typename Dict::iterator Dict::end() { + return iterator{map_.end()}; +} + +template +typename Dict::const_iterator Dict::end() const { + return const_iterator{map_.end()}; +} + +template +typename Dict::const_iterator Dict::cend() const { + return const_iterator{map_.cend()}; +} + +template +bool Dict::empty() const { + return map_.empty(); +} + +template +typename Dict::size_type Dict::size() const { + return map_.size(); +} + +template +void Dict::clear() { + map_.clear(); +} + +template +template +std::pair::iterator, bool> Dict::insert(Key_&& key, Value_&& value) { + static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); + static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); + auto inserted = map_.insert(std::pair{ + Key(std::forward(key)), + Value(std::forward(value))}); + return {iterator{inserted.first}, inserted.second}; +} + +template +template +std::pair::iterator, bool> Dict::insert_or_assign(Key_&& key, Value_&& value) { + static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert_or_assign"); + static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert_or_assign"); + auto inserted = map_.insert_or_assign( + Key(std::forward(key)), + Value(std::forward(value))); + return {iterator{inserted.first}, inserted.second}; +} + +template +void Dict::erase(const_iterator iter) { + map_.erase(iter.entryRef_.iterator_); +} + 
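Since the Dict member definitions are only being moved into this new Dict_inl.h, not changed, a brief usage sketch may help keep the declared interface in mind. This is a hypothetical illustration, not code from the patch; the concrete key/value types are assumptions, and the helper is written as a template so it does not depend on a particular Dict instantiation.

// Hedged sketch: exercises the Dict API declared above (insert,
// insert_or_assign, find, contains, at, erase). The caller is assumed to
// pass something like a Dict<int64_t, std::string>.
#include <cassert>
#include <cstdint>
#include <string>

template <class DictT>
void exercise_dict(DictT& dict) {
  // insert() returns a pair<iterator, bool>; the bool is false if the key already existed.
  auto inserted = dict.insert(int64_t(1), std::string("one"));
  assert(inserted.second);

  // insert_or_assign() overwrites the mapped value for an existing key.
  dict.insert_or_assign(int64_t(1), std::string("uno"));

  assert(dict.contains(int64_t(1)));
  if (dict.find(int64_t(1)) != dict.end()) {
    // at() returns the mapped value or throws std::out_of_range.
    assert(dict.at(int64_t(1)) == "uno");
  }

  // erase(key) reports how many elements were removed (0 or 1).
  size_t removed = dict.erase(int64_t(1));
  assert(removed == 1);
  assert(dict.empty());
}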
+template +C10_NODISCARD size_t Dict::erase(const Key& key) { + return map_.erase(key); +} + +template +Value Dict::at(const Key& key) const { + return map_.at(key).template to(); +} + +template +typename Dict::iterator Dict::find(const Key& key) { + return iterator{map_.find(key)}; +} + +template +typename Dict::const_iterator Dict::find(const Key& key) const { + return const_iterator{map_.find(key)}; +} + +template +bool Dict::contains(const Key& key) const { + return end() != find(key); +} + +template +void Dict::reserve(size_type count) { + map_.reserve(count); +} + +} diff --git a/aten/src/ATen/core/LegacyTypeDispatch.cpp b/aten/src/ATen/core/LegacyTypeDispatch.cpp index d5b959a06508..f20062dbd34d 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.cpp +++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp @@ -2,54 +2,6 @@ namespace at { -/// NOTE [ Treating Variables as non-Variables in type dispatch ] -/// -/// Previously, in VariableType_*.cpp (generated by gen_variable_type.py), when -/// a function is using the 'use_derived' strategy, we call its implementation -/// on the base non-Variable type (`baseType`), passing unwrapped tensors to the -/// call so that any `.dispatch_type()` calls in the implementation can treat the passed -/// tensors as non-Variables and won't dispatch back to functions in VariableType. -/// -/// However, after the Variable/Tensor merge, there is no concept of unwrapping -/// a tensor anymore, and directly passing variables to the base type calls will -/// cause the `.dispatch_type()` dispatch in the implementation to treat the tensor as a -/// variable, and any function dispatch based on `.dispatch_type()` will dispatch back to -/// VariableType, which is not what we want. -/// -/// The solution to the above problem is to add `at::NonVariableTypeMode`, which -/// when enabled will cause `legacyTensorType()` and `getType()` to always return -/// non-Variable type, even if the tensor being called on is a variable. -/// -/// TODO: Since `torch::NoGradGuard` serves the same purpose in libtorch, we should -/// merge these two thread-local guards. - -/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, -/// thread_local is not supported. In that case, we don't provide -/// `at::NonVariableTypeMode`. -#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY - -thread_local bool NonVariableTypeMode_enabled = false; - -bool NonVariableTypeMode::is_enabled() { - return NonVariableTypeMode_enabled; -} - -void NonVariableTypeMode::set_enabled(bool enabled) { - NonVariableTypeMode_enabled = enabled; -} - -#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) - -bool NonVariableTypeMode::is_enabled() { - throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); -} - -void NonVariableTypeMode::set_enabled(bool enabled) { - throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); -} - -#endif - // TODO: This could be bad juju if someone calls globalContext() in the // destructor of an object with static lifetime. LegacyTypeDispatch & globalLegacyTypeDispatch() { diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 93b5348ec6a4..e65205124d10 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -139,11 +139,6 @@ class CAFFE2_API LegacyTypeDispatch { CAFFE2_API LegacyTypeDispatch& globalLegacyTypeDispatch(); -struct CAFFE2_API NonVariableTypeMode { - static bool is_enabled(); - static void set_enabled(bool enabled); -}; - // A RAII, thread local (!) 
guard that has the following effect: // // Upon construction: sets NonVariableTypeMode_enabled for the current thread to @@ -180,7 +175,7 @@ inline Type& legacyTensorType(const TensorImpl& tensor) { return *globalLegacyTypeDispatch().getTypeRaw( tensorTypeIdToBackend(tensor.type_id()), typeMetaToScalarType(tensor.dtype()), - tensor.is_variable() && !at::NonVariableTypeMode::is_enabled()); + tensor.is_variable()); } inline void initializeLegacyTypeDispatchFor(const TensorImpl& tensor) { @@ -188,7 +183,7 @@ inline void initializeLegacyTypeDispatchFor(const TensorImpl& tensor) { globalLegacyTypeDispatch().getType( tensorTypeIdToBackend(tensor.type_id()), typeMetaToScalarType(tensor.dtype()), - tensor.is_variable() && !at::NonVariableTypeMode::is_enabled()); + tensor.is_variable()); } } // namespace at diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index aa611e87454f..4597432cf171 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -35,14 +35,14 @@ void Tensor::enforce_invariants() { void Tensor::print() const { if (defined()) { - std::cerr << "[" << dispatch_type().toString() << " " << sizes() << "]" << std::endl; + std::cerr << "[" << type().toString() << " " << sizes() << "]" << std::endl; } else { std::cerr << "[UndefinedTensor]" << std::endl; } } -const char * Tensor::toString() const { - return dispatch_type().toString(); +std::string Tensor::toString() const { + return type().toString(); } } // namespace at diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index ba1daec99ab6..55e80aa630f3 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,7 @@ class CAFFE2_API Tensor { return impl_.weak_use_count(); } - const char * toString() const; + std::string toString() const; IntArrayRef sizes() const { return impl_->sizes(); @@ -165,8 +166,8 @@ class CAFFE2_API Tensor { int64_t ndimension() const { return dim(); } - bool is_contiguous() const { - return impl_->is_contiguous(); + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const { + return impl_->is_contiguous(memory_format); } // Total bytes consumed by the "view" of elements of the array. 
Does not @@ -193,7 +194,7 @@ class CAFFE2_API Tensor { return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( tensorTypeIdToBackend(type_id()), scalar_type(), - is_variable() && !at::NonVariableTypeMode::is_enabled()); + is_variable()); } Type & dispatch_type() const { return legacyTensorType(*impl_); @@ -266,7 +267,7 @@ class CAFFE2_API Tensor { template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return TensorAccessor(data(),sizes().data(),strides().data()); } template @@ -280,7 +281,7 @@ class CAFFE2_API Tensor { template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> PackedTensorAccessor packed_accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); } template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> @@ -373,7 +374,7 @@ class CAFFE2_API Tensor { Tensor & clamp_max_(Scalar max); Tensor clamp_min(Scalar min) const; Tensor & clamp_min_(Scalar min); - Tensor contiguous() const; + Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const; Tensor & copy_(const Tensor & src, bool non_blocking=false); Tensor cos() const; Tensor & cos_(); @@ -580,8 +581,9 @@ class CAFFE2_API Tensor { Tensor to_sparse(int64_t sparse_dim) const; Tensor to_sparse() const; Tensor to_mkldnn() const; - Tensor quantize_linear(double scale, int64_t zero_point) const; + Tensor quantize_linear(double scale, int64_t zero_point, ScalarType dtype) const; Tensor dequantize() const; + Tensor dequantize_linear(double scale, int64_t zero_point, ScalarType dtype) const; Scalar q_scale() const; Scalar q_zero_point() const; Tensor int_repr() const; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index 138842d532be..25ce96c9b749 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -176,8 +177,8 @@ inline Tensor Tensor::clamp_min(Scalar min) const { inline Tensor & Tensor::clamp_min_(Scalar min) { return dispatch_type().clamp_min_(*this, min); } -inline Tensor Tensor::contiguous() const { - return dispatch_type().contiguous(*this); +inline Tensor Tensor::contiguous(MemoryFormat memory_format) const { + return dispatch_type().contiguous(*this, memory_format); } inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { return dispatch_type().copy_(*this, src, non_blocking); @@ -797,12 +798,15 @@ inline Tensor Tensor::to_sparse() const { inline Tensor Tensor::to_mkldnn() const { return dispatch_type().to_mkldnn(*this); } -inline Tensor Tensor::quantize_linear(double scale, int64_t zero_point) const { - return dispatch_type().quantize_linear(*this, scale, zero_point); +inline Tensor Tensor::quantize_linear(double scale, int64_t zero_point, ScalarType dtype) const { + return dispatch_type().quantize_linear(*this, scale, zero_point, dtype); } inline Tensor Tensor::dequantize() const { return dispatch_type().dequantize(*this); } +inline Tensor Tensor::dequantize_linear(double scale, int64_t 
zero_point, ScalarType dtype) const { + return dispatch_type().dequantize_linear(*this, scale, zero_point, dtype); +} inline Scalar Tensor::q_scale() const { return dispatch_type().q_scale(*this); } @@ -1372,7 +1376,7 @@ inline bool is_quantized(Tensor self) { #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ - AT_CHECK( \ + TORCH_CHECK( \ scalar_type() == ScalarType::name, \ "expected scalar type ", \ #name, \ diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 68451b81ce08..ff1bb03e7e55 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -182,7 +183,7 @@ struct CAFFE2_API Type { virtual Tensor & clamp_max_(Tensor & self, Scalar max) const = 0; virtual Tensor clamp_min(const Tensor & self, Scalar min) const = 0; virtual Tensor & clamp_min_(Tensor & self, Scalar min) const = 0; - virtual Tensor contiguous(const Tensor & self) const = 0; + virtual Tensor contiguous(const Tensor & self, MemoryFormat memory_format) const = 0; virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor cos(const Tensor & self) const = 0; virtual Tensor & cos_(Tensor & self) const = 0; @@ -390,8 +391,9 @@ struct CAFFE2_API Type { virtual Tensor to_sparse(const Tensor & self, int64_t sparse_dim) const = 0; virtual Tensor to_sparse(const Tensor & self) const = 0; virtual Tensor to_mkldnn(const Tensor & self) const = 0; - virtual Tensor quantize_linear(const Tensor & self, double scale, int64_t zero_point) const = 0; + virtual Tensor quantize_linear(const Tensor & self, double scale, int64_t zero_point, ScalarType dtype) const = 0; virtual Tensor dequantize(const Tensor & self) const = 0; + virtual Tensor dequantize_linear(const Tensor & self, double scale, int64_t zero_point, ScalarType dtype) const = 0; virtual Scalar q_scale(const Tensor & self) const = 0; virtual Scalar q_zero_point(const Tensor & self) const = 0; virtual Tensor int_repr(const Tensor & self) const = 0; diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index 36704e424a8d..c9cb3d71f403 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -22,11 +22,6 @@ class AliasInfo { static const Symbol wc = Symbol::fromQualString("alias::*"); return wc; } - static AliasInfo createWildcard() { - AliasInfo ret; - ret.addBeforeSet(wildcardSet()); - return ret; - } void setIsWrite(bool isWrite) { isWrite_ = isWrite; @@ -57,10 +52,14 @@ class AliasInfo { return *beforeSets_.begin(); } - bool isWildcard() const { + bool isWildcardBefore() const { return beforeSets_.count(wildcardSet()) != 0; } + bool isWildcardAfter() const { + return afterSets_.count(wildcardSet()) != 0; + } + // the alias info for the contained types of the type // e.g. if this is an annotation on List[T], `sets` refers to // the alias sets that the list may be in diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index cec23de35dc7..ac3e71ebf5d8 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -8,7 +8,6 @@ // To explicitly use interned strings as symbols in your code, you must add // them to this list. 
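A small aside on the comment closing this hunk header: symbols from this list are normally consumed through the Symbol API rather than as raw strings. The following is a hedged, illustrative example only; it assumes c10::Symbol::fromQualString from ATen/core/interned_strings.h (also touched in this patch) and uses "aten::std_mean" purely because this diff adds that entry.

// Hedged sketch: resolving an interned string to a Symbol and back.
#include <ATen/core/interned_strings.h>
#include <iostream>

int main() {
  c10::Symbol s = c10::Symbol::fromQualString("aten::std_mean");
  std::cout << s.toQualString() << std::endl;  // prints "aten::std_mean"
  return 0;
}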
-#if !defined(C10_MOBILE) || defined(FEATURE_TORCH_MOBILE) #define FORALL_ATEN_BASE_SYMBOLS(_) \ _(aten, __and__) \ _(aten, __iand__) \ @@ -258,6 +257,8 @@ _(aten, cosh) \ _(aten, cosine_embedding_loss) \ _(aten, cosine_similarity) \ _(aten, cross) \ +_(aten, std_mean) \ +_(aten, var_mean) \ _(aten, ctc_loss) \ _(aten, cudnn_affine_grid_generator) \ _(aten, cudnn_affine_grid_generator_backward) \ @@ -906,6 +907,8 @@ _(attr, padding_value) \ _(attr, params) \ _(attr, pdist) \ _(attr, cdist) \ +_(attr, std_mean) \ +_(attr, var_mean) \ _(attr, periodic) \ _(attr, pivot) \ _(attr, pivots) \ @@ -1013,4 +1016,3 @@ _(attr, workspace) \ _(attr, x) \ _(attr, x1) \ _(attr, x2) -#endif diff --git a/aten/src/ATen/core/dispatch/DispatchTable.h b/aten/src/ATen/core/dispatch/DispatchTable.h index 9c4dafcbf116..ddf0564af187 100644 --- a/aten/src/ATen/core/dispatch/DispatchTable.h +++ b/aten/src/ATen/core/dispatch/DispatchTable.h @@ -61,7 +61,7 @@ class KernelTable_ final { if (!emplaced.second) { // Element already existed. Overwrite it. emplaced.first->second = value; - AT_WARN("Registered a kernel that overwrote a previoulsy registered kernel with same dispatch key '", + AT_WARN("Registered a kernel that overwrote a previously registered kernel with same dispatch key '", detail::dispatch_key_to_string(key), "' for operator '", operator_name ,"'."); } } @@ -205,7 +205,7 @@ class DispatchTable final { bool is_valid_; TensorTypeId get_dispatch_key(const Stack* stack) const { - auto first_tensor_arg = torch::jit::peek( + const IValue& first_tensor_arg = torch::jit::peek( *stack, 0, reverse_index_of_first_tensor_arg_ @@ -217,8 +217,7 @@ class DispatchTable final { } return tensor_list[0].type_id(); } else { - // TODO Avoid bumping the refcounter - return first_tensor_arg.toTensor().type_id(); + return first_tensor_arg.unsafeToTensorImpl()->type_id(); } } }; diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index d5e4c188589b..d1bfa03c4009 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -97,7 +97,7 @@ inline void FunctionSchema::checkAndNormalizeInputs( std::vector& inputs, const std::unordered_map& kwargs) const { // Do we have more inputs than the schema accepts? - AT_CHECK( + TORCH_CHECK( inputs.size() <= arguments().size(), "Expected at most ", arguments().size(), diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index f073f061259b..1f1d1f4e0e86 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -5,9 +5,12 @@ #include #include -#include #include +#if !defined(C10_MOBILE) || defined(FEATURE_TORCH_MOBILE) +#include +#endif + namespace c10 { #if !defined(C10_MOBILE) || defined(FEATURE_TORCH_MOBILE) @@ -70,6 +73,7 @@ namespace c10 { _(prim, requires_grad) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ + _(prim, Guard) \ _(prim, FusedConcat) \ _(prim, ConstantChunk) \ _(prim, MMTreeReduce) \ @@ -205,7 +209,7 @@ namespace c10 { // 'onnx' symbols correspond to ONNX operators. Their semantics // are defined in https://github.com/onnx/onnx/blob/master/docs/Operators.md // The particular version we are targeting is specified by '_onnx_opset_version' -// in torch.onnx.symbolic +// in torch.onnx.symbolic_helper // // In general, most ONNX operators won't get an entry here, because they // are handled from the Python end. 
However, you may occasionally need diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 7382f904a997..b41c8b311256 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -114,6 +114,16 @@ std::string ivalue::Object::name() const { return this->type_->qualname(); } +IValue ivalue::Object::getAttr(const std::string& name) const { + const size_t slot = type_->getAttributeSlot(name); + return getSlot(slot); +} + +void ivalue::Object::setAttr(const std::string& name, IValue v) { + const size_t slot = type_->getAttributeSlot(name); + setSlot(slot, std::move(v)); +} + void ivalue::Object::resizeObject(size_t slot) { AT_ASSERT(slot < type()->numAttributes()); slots_.resize(type()->numAttributes()); diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 9d6e46b3306a..dfc809323c89 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -1,623 +1,409 @@ #pragma once -#include -#include - -#include -#include -#include -#include -#include -#include +#include +#include +#include +namespace torch { +namespace jit { +namespace script { +struct Function; +} +} // namespace jit +} // namespace torch namespace c10 { +template class Dict; struct IValue; -struct ClassType; - -template -c10::intrusive_ptr IValue::moveToIntrusivePtr() { - auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - clearToNone(); - return t; -} -template -c10::intrusive_ptr IValue::toIntrusivePtr() const { - auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); - auto p = r; - r.release(); - return p; -} - -inline c10::intrusive_ptr IValue::toTuple() && { - AT_ASSERT(isTuple()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toTuple() const & { - AT_ASSERT(isTuple()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toFuture() && { - AT_ASSERT(isFuture()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toFuture() const & { - AT_ASSERT(isFuture()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toIntList() && { - AT_ASSERT(isIntList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toIntList() const & { - AT_ASSERT(isIntList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toString() && { - AT_ASSERT(isString()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toString() const & { - AT_ASSERT(isString()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toDoubleList() && { - AT_ASSERT(isDoubleList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toDoubleList() const & { - AT_ASSERT(isDoubleList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toBoolList() && { - AT_ASSERT(isBoolList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toBoolList() const & { - AT_ASSERT(isBoolList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toTensorList() && { - AT_ASSERT(isTensorList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toTensorList() const & { - AT_ASSERT(isTensorList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericList() && { - AT_ASSERT(isGenericList()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericList() const & { - AT_ASSERT(isGenericList()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericDict() && { - AT_ASSERT(isGenericDict()); - return 
moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toGenericDict() const & { - AT_ASSERT(isGenericDict()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toObject() && { - AT_ASSERT(isObject()); - return toIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toObject() const & { - AT_ASSERT(isObject()); - return toIntrusivePtr(); -} -inline at::Tensor IValue::toTensor() && { - AT_ASSERT(isTensor()); - return at::Tensor(moveToIntrusivePtr()); -} -inline at::Tensor IValue::toTensor() const & { - AT_ASSERT(isTensor()); - return at::Tensor(toIntrusivePtr()); -} -inline c10::intrusive_ptr IValue::toBlob() && { - AT_ASSERT(isBlob()); - return moveToIntrusivePtr(); -} -inline c10::intrusive_ptr IValue::toBlob() const & { - AT_ASSERT(isBlob()); - return toIntrusivePtr();; -} - namespace ivalue { - -template -using Shared = c10::intrusive_ptr; - -// string -struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { - private: - const std::string str_; - public: - ConstantString(std::string str) - : str_(std::move(str)) {} - static c10::intrusive_ptr create(std::string str_); - const std::string & string() const { - return str_; - } - operator const std::string & () const { - return string(); - } - CAFFE2_API friend std::ostream& operator<<( - std::ostream& out, - const ConstantString& v); -}; - -template -struct CAFFE2_API List : c10::intrusive_ptr_target { - private: - std::vector elements_; - - public: - typedef Elem ElemType; - - List(std::vector elements_) : elements_(std::move(elements_)) {} - static c10::intrusive_ptr> create(std::vector elements_) { - return c10::make_intrusive>(std::move(elements_)); - } - const std::vector& elements() const & { - return elements_; - } - operator const std::vector&() const { - return elements(); - } - - std::vector& elements() & { - return elements_; - } - operator std::vector&() { - return elements(); - } - - std::vector&& elements() && { - return std::move(elements_); - } -}; - +struct Tuple; +template struct List; +using IntList = List; +using TensorList = List; +using DoubleList = List; +using BoolList = List; +using GenericList = List; struct Future; +struct ConstantString; struct GenericDict; - -struct CAFFE2_API Tuple : public List { - using List::List; - static c10::intrusive_ptr create(std::vector elements_) { - return c10::make_intrusive(std::move(elements_)); - } -}; - struct Object; } -// Future -struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { - private: - c10::intrusive_ptr intrusive_from_this() { - c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer - // from a raw `this` pointer - // so we need to bump the refcount - // to account for this ownership - return c10::intrusive_ptr::reclaim(this); +// IValue is the generic tagged union used by the interpreter to hold +// all value types. +// It is a 16-byte object with an 8-byte payload and an 8-byte tag. +// The tag is currently 4 bytes to determine the type, and 1 byte +// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs +// retain/release calls. 
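To make the layout described in the comment above concrete, here is a minimal stand-alone sketch of the same idea: an 8-byte payload union, a tag, and a flag recording whether the payload is an intrusive pointer that would need retain/release. This is not the real IValue (its definition follows immediately below in the diff); all names here are illustrative assumptions.

// Hedged sketch: a stripped-down tagged union in the spirit of IValue.
#include <cstdint>
#include <cstdio>

struct ToyValue {
  enum class Tag : uint32_t { None, Int, Double };

  union Payload {
    int64_t as_int;
    double as_double;
  } payload;
  Tag tag = Tag::None;
  bool is_ptr = false;  // the real class increfs/decrefs when this is set

  ToyValue() { payload.as_int = 0; }
  explicit ToyValue(int64_t i) : tag(Tag::Int) { payload.as_int = i; }
  explicit ToyValue(double d) : tag(Tag::Double) { payload.as_double = d; }

  bool isInt() const { return tag == Tag::Int; }
  bool isDouble() const { return tag == Tag::Double; }
};

int main() {
  ToyValue v(int64_t(7));
  if (v.isInt()) {
    std::printf("int payload: %lld\n", static_cast<long long>(v.payload.as_int));
  }
  // On common 64-bit ABIs this is 16 bytes: 8-byte payload + 4-byte tag +
  // 1-byte flag + padding, matching the size quoted in the comment above.
  std::printf("sizeof(ToyValue) = %zu\n", sizeof(ToyValue));
  return 0;
}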
+ +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Bool) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(BoolList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) \ + _(GenericDict) \ + _(Future) \ + _(Device) \ + _(Object) + +struct CAFFE2_API IValue final { + IValue() + : payload{0} + , tag(Tag::None) + , is_intrusive_ptr(false) {} + IValue(const IValue& rhs) + : payload(rhs.payload), + tag(rhs.tag), + is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + } } - - public: - struct CAFFE2_API FutureError final : public std::exception { - FutureError(std::string&& error_msg_) - : error_msg(std::move(error_msg_)) {} - - FutureError() = default; - - const char* what() const noexcept override { - return error_msg.c_str(); + IValue(IValue&& rhs) noexcept : IValue() { + swap(rhs); + } + ~IValue() { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); } + } + IValue & operator=(IValue && rhs) & noexcept { + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + return *this; + } + IValue & operator=(IValue const & rhs) & { + IValue(rhs).swap(*this); + return *this; + } - std::string error_msg; - }; + void dump() const; - /** - * Wait on the future until it completes. - */ - void wait() { - if (completed()) { - return; + bool isAliasOf(const IValue& rhs) const { + if (this->tag != rhs.tag) { + // Trivially don't alias if the type is different + return false; } - std::condition_variable finished; - bool fired = false; - - // Add a callback to notify the current thread - // when the current future completes. - addCallback([&] { - std::unique_lock lock(mutex_); - finished.notify_all(); - fired = true; - }); - - // The current thread will be blocked unless the above callback is fired. - std::unique_lock lock(mutex_); - while (!fired) { - finished.wait(lock); + + if (!this->is_intrusive_ptr) { + // Primitive types don't alias anything + return false; } - AT_ASSERT(completed()); - } + AT_ASSERT(rhs.is_intrusive_ptr); - /** - * Explicitly mark the future as completed with the output value. 
- */ - void markCompleted(IValue value) { - { - // This is not to protect completed_ but to create a barrier - // from possible addCallback() calls - std::unique_lock lock(mutex_); - AT_ASSERT(!completed()); - completed_ = true; - value_ = std::move(value); + // Tensors should be compared based on internal storage + if (this->isTensor()) { + const auto thisTensor = this->toTensor(); + const auto rhsTensor = rhs.toTensor(); + return thisTensor.is_alias_of(rhsTensor); } - fireCallbacks(); + // Other types can be compared by their ptr value + return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + } + void swap(IValue & rhs) noexcept { + std::swap(payload, rhs.payload); + std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); + std::swap(tag, rhs.tag); } - void markCompleted(FutureError&& error_) { - { - // This is not to protect completed_ but to create a barrier - // from possible addCallback() calls - std::unique_lock lock(mutex_); - AT_ASSERT(!completed()); - completed_ = true; - has_error = true; - error = std::move(error_); - } - - fireCallbacks(); + // Accessors for subtypes are arranged together below + // While some of these accessors could be generated through templates, + // we prefer to write them manually for clarity + + // Tensor + IValue(at::Tensor t) + : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { + // Note: the undefined tensor is not refcounted, so while it + // is tagged as a tensor, is_intrusive_ptr is set to false. + // This is not an optional optimization: our incref call + // *will not* do the right thing when called on an + // undefined tensor. + payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + } + bool isTensor() const { return Tag::Tensor == tag; } + at::Tensor toTensor() &&; + at::Tensor toTensor() const &; + at::TensorImpl* unsafeToTensorImpl() const { + return static_cast(payload.as_intrusive_ptr); } - // Get the result of the current future. - IValue value() { - std::unique_lock lock(mutex_); - AT_ASSERT(completed()); - if (has_error) { - throw error; - } - return value_; - } - - /** - * Add a callback to the future. - * The callbacks will be executed once the future completes. - * If the future has already completed, - * this function will execute the callback immediately. - */ - void addCallback(std::function callback) { - std::unique_lock lock(mutex_); - if (completed()) { - lock.unlock(); - callback(); - return; - } - callbacks.push_back(callback); + const IValue& toIValue() const { + return *this; + } + IValue& toIValue() { + return *this; } - // Check if the current future has completed - bool completed() { - return completed_; + IValue(intrusive_ptr blob) + : tag(Tag::Blob), is_intrusive_ptr(true) { + // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract + // and store it as a Tensor instead. 
+ payload.as_intrusive_ptr = blob.release(); + } + bool isBlob() const { + return Tag::Blob == tag; + } + c10::intrusive_ptr toBlob() &&; + c10::intrusive_ptr toBlob() const &; + + // Tuple + IValue(c10::intrusive_ptr v); + bool isTuple() const { return Tag::Tuple == tag; } + c10::intrusive_ptr toTuple() &&; + c10::intrusive_ptr toTuple() const &; + + // Double + IValue(double d) + : tag(Tag::Double), is_intrusive_ptr(false) { + payload.as_double = d; + } + bool isDouble() const { return Tag::Double == tag; } + double toDouble() const { + AT_ASSERT(isDouble()); + return payload.as_double; } - CAFFE2_API friend std::ostream& operator<<( - std::ostream& out, - const Future& v); + // Future + IValue(c10::intrusive_ptr v); + bool isFuture() const { return Tag::Future == tag; } + c10::intrusive_ptr toFuture() &&; + c10::intrusive_ptr toFuture() const &; - private: - void fireCallbacks() { - AT_ASSERT(completed()); - // There is no need to protect callbacks with the lock. - // Once completed_ is set to true, no one can add new callback to the list. - for (auto& callback : callbacks) { - callback(); - } - callbacks.clear(); + // Int + IValue(int64_t i) + : tag(Tag::Int), is_intrusive_ptr(false) { + payload.as_int = i; } - std::mutex mutex_; - IValue value_; // when finished the value - std::atomic_bool completed_ = {false}; // is this future complete - std::vector> callbacks; - bool has_error = false; - FutureError error; -}; + // allow you to pass literals (3, 4) without ambiguity + IValue(int32_t i) + : IValue(static_cast(i)) {} -// User-defined object. -struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { - public: - Object(std::shared_ptr type, size_t numSlots) : type_(std::move(type)) { - slots_.resize(numSlots); - } + bool isInt() const { return Tag::Int == tag; } - static c10::intrusive_ptr create( - std::shared_ptr type, - size_t numSlots) { - return c10::make_intrusive(std::move(type), numSlots); + int64_t toInt() const { + AT_ASSERT(isInt()); + return payload.as_int; } - void setSlot(size_t slot, IValue v) { - if (slot >= slots_.size()) { - // for module types, it is possible that the members of the class have - // expanded after the object was created. 
In this case, we expand - // the slots to the right size - resizeObject(slot); - } - slots_[slot] = v; + // Bool + IValue(bool b) + : tag(Tag::Bool), is_intrusive_ptr(false) { + payload.as_bool = b; } - - const IValue& getSlot(size_t slot) const { - return slots_.at(slot); + bool isBool() const { return Tag::Bool == tag; } + bool toBool() const { + AT_ASSERT(isBool()); + return payload.as_bool; } - std::string name() const; - - const std::vector& slots() const { - return slots_; + // IntList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + IValue(at::ArrayRef v) + : IValue(v.vec()) {} + bool isIntList() const { return Tag::IntList == tag; } + c10::intrusive_ptr toIntList() &&; + c10::intrusive_ptr toIntList() const &; + + const std::vector& toIntListRef() const; + const std::vector& toDoubleListRef() const; + const std::vector& toBoolListRef() const; + const std::vector& toTensorListRef() const; + const std::vector& toGenericListRef() const; + const c10::Dict& toGenericDictRef() const; + const std::string& toStringRef() const; + + // ConstantString + IValue(c10::intrusive_ptr v); + IValue(std::string v); + IValue(const char* v): IValue(std::string(v)) {} + bool isString() const { return Tag::String == tag; } + c10::intrusive_ptr toString() &&; + c10::intrusive_ptr toString() const &; + + // DoubleList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isDoubleList() const { return Tag::DoubleList == tag; } + c10::intrusive_ptr toDoubleList() &&; + c10::intrusive_ptr toDoubleList() const &; + + // BoolList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isBoolList() const { return Tag::BoolList == tag; } + c10::intrusive_ptr toBoolList() &&; + c10::intrusive_ptr toBoolList() const &; + + //TensorList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + c10::intrusive_ptr toTensorList() &&; + c10::intrusive_ptr toTensorList() const &; + + //GenericList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isGenericList() const { return Tag::GenericList == tag; } + c10::intrusive_ptr toGenericList() &&; + c10::intrusive_ptr toGenericList() const &; + + // GenericDict + IValue(c10::intrusive_ptr v); + IValue(c10::Dict v); + bool isGenericDict() const { return Tag::GenericDict == tag; } + c10::intrusive_ptr toGenericDict() &&; + c10::intrusive_ptr toGenericDict() const &; + + // ClassType + IValue(c10::intrusive_ptr v); + bool isObject() const { return tag == Tag::Object; } + c10::intrusive_ptr toObject() &&; + c10::intrusive_ptr toObject() const & ; + + // None + bool isNone() const { + return Tag::None == tag; } - std::shared_ptr type() const { - return type_; + std::string toNone() const { + AT_ASSERT(isNone()); + return "None"; } - - private: - void resizeObject(size_t slot); - std::shared_ptr type_; - std::vector slots_; -}; - -struct C10_EXPORT ivalue::GenericDict : c10::intrusive_ptr_target { - private: - c10::impl::GenericDict elements_; - - public: - GenericDict(c10::impl::GenericDict elements_) - : elements_(std::move(elements_)) {} - static c10::intrusive_ptr create( - c10::impl::GenericDict elements_) { - return c10::make_intrusive(std::move(elements_)); + // Scalar, which gets encoded as either an Int or a Double + IValue(at::Scalar s) + : IValue() { + if(s.isFloatingPoint()) { + *this = s.toDouble(); + } else { + *this = s.toLong(); + } } - const c10::impl::GenericDict& elements() const & { - return elements_; + bool isScalar() const { + return isDouble() || 
isInt(); } - c10::impl::GenericDict& elements() & { - return elements_; + at::Scalar toScalar() const { + if(isDouble()) + return toDouble(); + else if(isInt()) + return toInt(); + throw std::runtime_error("IValue is not a Scalar"); } - using IterationOrder = std::vector>; - const IterationOrder iterationOrder() const; -}; - -#undef TORCH_FORALL_TAGS - -namespace detail { - -struct _guarded_unsigned_long_unique_dummy final { - _guarded_unsigned_long_unique_dummy(int64_t){}; -}; -using _guarded_unsigned_long = c10::guts::conditional_t< - std::is_same::value || - std::is_same::value, - _guarded_unsigned_long_unique_dummy, - unsigned long>; - -} // namespace detail - -#define DEFINE_TO(type, method_name) \ -template<> \ -inline type IValue::to() && { \ - return std::move(*this).method_name(); \ -} \ -template<> \ -inline type IValue::to() const & { \ - return this->method_name(); \ -} -DEFINE_TO(at::Tensor, toTensor) -DEFINE_TO(c10::intrusive_ptr, toTuple) -DEFINE_TO(float, toDouble) -DEFINE_TO(double, toDouble) -DEFINE_TO(unsigned char, toInt) -DEFINE_TO(signed char, toInt) -DEFINE_TO(unsigned short, toInt) -DEFINE_TO(short, toInt) -DEFINE_TO(int, toInt) -DEFINE_TO(uint32_t, toInt) -DEFINE_TO(uint64_t, toInt) -DEFINE_TO(detail::_guarded_unsigned_long, toInt) -DEFINE_TO(int64_t, toInt) -DEFINE_TO(bool, toBool) -DEFINE_TO(c10::intrusive_ptr, toBlob); -DEFINE_TO(c10::intrusive_ptr, toDoubleList) -DEFINE_TO(c10::intrusive_ptr, toIntList) -DEFINE_TO(c10::intrusive_ptr, toBoolList) -DEFINE_TO(c10::intrusive_ptr, toTensorList) -DEFINE_TO(c10::intrusive_ptr, toGenericList) -DEFINE_TO(c10::intrusive_ptr, toGenericDict) -DEFINE_TO(c10::intrusive_ptr, toString) -DEFINE_TO(c10::intrusive_ptr, toObject) -DEFINE_TO(at::Scalar, toScalar) -DEFINE_TO(std::vector, toIntListRef) -DEFINE_TO(std::vector, toDoubleListRef) -DEFINE_TO(std::vector, toBoolListRef) -DEFINE_TO(std::vector, toTensorListRef) -DEFINE_TO(std::vector, toGenericListRef) -DEFINE_TO(std::string, toStringRef) -DEFINE_TO(c10::intrusive_ptr, toFuture) -DEFINE_TO(IValue, toIValue) -DEFINE_TO(c10::Device, toDevice) -DEFINE_TO(at::ScalarType, toScalarType) -DEFINE_TO(at::Layout, toLayout) - -template -struct _fake_type {}; - -template -std::vector generic_to( - const IValue* ivalue, - _fake_type>) { - return fmap(ivalue->toGenericListRef(), [](IValue item_ivalue) { return item_ivalue.to(); }); -} - -template -std::unordered_map generic_to( - const IValue* ivalue, - _fake_type>) { - std::unordered_map specialized_dict; - - for (auto item : ivalue->toGenericDictRef()) { - specialized_dict[item.key().to()] = item.value().to(); + // Device + IValue(c10::Device d) + : tag(Tag::Device), is_intrusive_ptr(false) { + payload.as_device.type = d.type(); + payload.as_device.index = d.index(); + } + bool isDevice() const { return Tag::Device == tag; } + c10::Device toDevice() const { + AT_ASSERT(isDevice()); + return c10::Device(payload.as_device.type, payload.as_device.index); } - return specialized_dict; -} - -template -inline T IValue::to() && { - return generic_to(this, _fake_type{}); -} - -template -inline T IValue::to() const& { - return generic_to(this, _fake_type{}); -} - -// note: when adding a DEFINE_TO case here you should also add a -// toX method to IValue. These named methods are much more discoverable -// than the to templated function. 
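The removed note above refers back to the DEFINE_TO list: the templated IValue::to<T>() is just a thin wrapper that forwards to the corresponding named accessor. A hedged usage sketch follows; it assumes ATen/core/ivalue.h is on the include path, and the values are arbitrary.

// Hedged sketch: to<T>() and the named accessors are interchangeable; the
// named form is preferred for readability, while the templated form is what
// generic helpers (e.g. stack pop/push utilities) rely on.
#include <ATen/core/ivalue.h>
#include <cassert>
#include <cstdint>

void demo() {
  c10::IValue iv(int64_t(42));
  assert(iv.isInt());
  int64_t a = iv.toInt();        // named accessor
  int64_t b = iv.to<int64_t>();  // generic accessor via DEFINE_TO(int64_t, toInt)
  assert(a == b);
}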
- -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::IntList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::IntList::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::string v) -: IValue(ivalue::ConstantString::create(std::move(v))) {} - -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::DoubleList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::DoubleList::create(std::move(v))) {} + // ScalarType + at::ScalarType toScalarType() const { + return static_cast(toInt()); + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::BoolList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::BoolList::create(std::move(v))) {} + // Layout + at::Layout toLayout() const { + return static_cast(toInt()); + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::TensorList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::TensorList::create(std::move(v))) {} + // MemoryFormat + at::MemoryFormat toMemoryFormat() const { + return static_cast(toInt()); + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::GenericList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(std::vector v) -: IValue(ivalue::GenericList::create(std::move(v))) {} -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::GenericDict), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(c10::impl::GenericDict v) -: IValue(ivalue::GenericDict::create(std::move(v))) {} + // for debugging + std::string tagKind() const { + switch(tag) { + #define DEFINE_CASE(x) case Tag::x: return #x; + TORCH_FORALL_TAGS(DEFINE_CASE) + #undef DEFINE_CASE + } + return "Invalid Tag"; + } -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Object), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} -inline IValue::IValue(c10::intrusive_ptr v) -: tag(Tag::Future), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); -} + // generic v.to() implementations + // that can be used in special functions like pop/push + // that use template meta-programming. + // prefer the directly named methods when you can, + // since they are simpler to understand -inline const std::vector& IValue::toIntListRef() const { - return toIntList()->elements(); -} + // Note: if you get linker errors saying one of these is missing, + // change it to ... && = delete; and you will see better error messages for why + // However, we cannot commit this because some compiler versions barf on it. 
+ template + T to() &&; + template + T to() const &; -inline const std::vector& IValue::toDoubleListRef() const { - return toDoubleList()->elements(); -} + // ToOptional: convert a IValue to the Optional obj that accepts both T and None + template + optional toOptional(); -inline const std::vector& IValue::toTensorListRef() const { - return toTensorList()->elements(); -} + // this is a shallow comparison of two IValues to test the object identity + bool isSameIdentity(const IValue& rhs) const; -inline const std::vector& IValue::toBoolListRef() const { - return toBoolList()->elements(); -} + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const IValue& v); -inline const std::vector& IValue::toGenericListRef() const { - return toGenericList()->elements(); -} + bool isPtrType() const { + return is_intrusive_ptr; + } -inline const c10::impl::GenericDict& IValue:: - toGenericDictRef() const { - return toGenericDict()->elements(); -} + private: + // NOTE: IValue tags are intentionally private. In the future we may encode + // this value different (e.g. using NaN boxing), and this would make it more + // costly to determine the tag for all types vs just determining if something + // is a particular type. Instead we want clients to use the `isX` methods when + // possible. If for perf. reasons you really, absolutely, must have a jump + // table, then we can revisit this. + enum class Tag : uint32_t { +#define DEFINE_TAG(x) x, + TORCH_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + }; -inline const std::string& IValue::toStringRef() const { - return toString()->string(); -} + template> + c10::intrusive_ptr moveToIntrusivePtr(); + template> + c10::intrusive_ptr toIntrusivePtr() const; -template -inline optional IValue::toOptional() { - if (this->isNone()) { - return nullopt; + void clearToNone() { + payload.as_int = 0; + tag = Tag::None; + is_intrusive_ptr = false; } - return this->to(); -} + union { + int64_t as_int; + double as_double; + bool as_bool; + c10::intrusive_ptr_target* as_intrusive_ptr; + struct { + DeviceType type; + DeviceIndex index; + } as_device; + } payload; + Tag tag; + bool is_intrusive_ptr; +}; -inline bool IValue::isSameIdentity(const IValue& rhs) const { - // We choose to not use memcmp for payload check due to potential random padding characters on union type - - // Semantics: - // 1. None is None, False is False, and True is True are all true - // 2. If it is a tensor type, we need to take undefined tensor into account - // 3. Undefined_tensor is None and vice versa should be true - // 4. If it is a reference type (i.e. is_intrusive_ptr), then is is True when the pointed-to object is the same. - // 5. False for all other comparisons. 
- if (this->isNone() && rhs.isNone()) { - return true; - } else if (this->isBool() && rhs.isBool()) { - // for bool type, do equality check - return this->toBool() == rhs.toBool(); - } else if (this->isTensor() && rhs.isTensor()) { - // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr is false for undefined tensor - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; - } else if (this->isTensor() && rhs.isNone()) { - // special case: undefined tensor and None are the same identity - return !this->is_intrusive_ptr; - } else if (this->isNone() && rhs.isTensor()) { - // special case: undefined tensor and None are the same identity - return !rhs.is_intrusive_ptr; - } else { - // for objects holding in IValue, do shallow compare on pointer address to testify the identity - return this->is_intrusive_ptr && rhs.is_intrusive_ptr - && this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; - } } -} // namespace c10 +#include diff --git a/aten/src/ATen/core/ivalue_base.h b/aten/src/ATen/core/ivalue_base.h deleted file mode 100644 index b1bd0a4473ef..000000000000 --- a/aten/src/ATen/core/ivalue_base.h +++ /dev/null @@ -1,391 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace c10 { -template class Dict; -struct IValue; -namespace ivalue { -struct Tuple; -template struct List; -using IntList = List; -using TensorList = List; -using DoubleList = List; -using BoolList = List; -using GenericList = List; -struct Future; -struct ConstantString; -struct GenericDict; -struct Object; -} - -// IValue is the generic tagged union used by the interpreter to hold -// all value types. -// It is a 16-byte object with an 8-byte payload and an 8-byte tag. -// The tag is currently 4 bytes to determine the type, and 1 byte -// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs -// retain/release calls. 
- -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) \ - _(Double) \ - _(Int) \ - _(Bool) \ - _(Tuple) \ - _(IntList) \ - _(DoubleList) \ - _(BoolList) \ - _(String) \ - _(TensorList) \ - _(Blob) \ - _(GenericList) \ - _(GenericDict) \ - _(Future) \ - _(Device) \ - _(Object) - -struct CAFFE2_API IValue final { - IValue() - : payload{0} - , tag(Tag::None) - , is_intrusive_ptr(false) {} - IValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), - is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); - } - } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); - } - ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } - } - IValue & operator=(IValue && rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None - return *this; - } - IValue & operator=(IValue const & rhs) & { - IValue(rhs).swap(*this); - return *this; - } - - void dump() const; - - bool isAliasOf(const IValue& rhs) const { - if (this->tag != rhs.tag) { - // Trivially don't alias if the type is different - return false; - } - - if (!this->is_intrusive_ptr) { - // Primitive types don't alias anything - return false; - } - - AT_ASSERT(rhs.is_intrusive_ptr); - - // Tensors should be compared based on internal storage - if (this->isTensor()) { - const auto thisTensor = this->toTensor(); - const auto rhsTensor = rhs.toTensor(); - return thisTensor.is_alias_of(rhsTensor); - } - - // Other types can be compared by their ptr value - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; - } - void swap(IValue & rhs) noexcept { - std::swap(payload, rhs.payload); - std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); - std::swap(tag, rhs.tag); - } - - // Accessors for subtypes are arranged together below - // While some of these accessors could be generated through templates, - // we prefer to write them manually for clarity - - // Tensor - IValue(at::Tensor t) - : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. - payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); - } - bool isTensor() const { return Tag::Tensor == tag; } - at::Tensor toTensor() &&; - at::Tensor toTensor() const &; - - const IValue& toIValue() const { - return *this; - } - IValue& toIValue() { - return *this; - } - - IValue(intrusive_ptr blob) - : tag(Tag::Blob), is_intrusive_ptr(true) { - // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract - // and store it as a Tensor instead. 
- payload.as_intrusive_ptr = blob.release(); - } - bool isBlob() const { - return Tag::Blob == tag; - } - c10::intrusive_ptr toBlob() &&; - c10::intrusive_ptr toBlob() const &; - - // Tuple - IValue(c10::intrusive_ptr v); - bool isTuple() const { return Tag::Tuple == tag; } - c10::intrusive_ptr toTuple() &&; - c10::intrusive_ptr toTuple() const &; - - // Double - IValue(double d) - : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; - } - bool isDouble() const { return Tag::Double == tag; } - double toDouble() const { - AT_ASSERT(isDouble()); - return payload.as_double; - } - - // Future - IValue(c10::intrusive_ptr v); - bool isFuture() const { return Tag::Future == tag; } - c10::intrusive_ptr toFuture() &&; - c10::intrusive_ptr toFuture() const &; - - // Int - IValue(int64_t i) - : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; - } - - // allow you to pass literals (3, 4) without ambiguity - IValue(int32_t i) - : IValue(static_cast(i)) {} - - bool isInt() const { return Tag::Int == tag; } - - int64_t toInt() const { - AT_ASSERT(isInt()); - return payload.as_int; - } - - // Bool - IValue(bool b) - : tag(Tag::Bool), is_intrusive_ptr(false) { - payload.as_bool = b; - } - bool isBool() const { return Tag::Bool == tag; } - bool toBool() const { - AT_ASSERT(isBool()); - return payload.as_bool; - } - - // IntList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - IValue(at::ArrayRef v) - : IValue(v.vec()) {} - bool isIntList() const { return Tag::IntList == tag; } - c10::intrusive_ptr toIntList() &&; - c10::intrusive_ptr toIntList() const &; - - const std::vector& toIntListRef() const; - const std::vector& toDoubleListRef() const; - const std::vector& toBoolListRef() const; - const std::vector& toTensorListRef() const; - const std::vector& toGenericListRef() const; - const c10::Dict& toGenericDictRef() const; - const std::string& toStringRef() const; - - // ConstantString - IValue(c10::intrusive_ptr v); - IValue(std::string v); - IValue(const char* v): IValue(std::string(v)) {} - bool isString() const { return Tag::String == tag; } - c10::intrusive_ptr toString() &&; - c10::intrusive_ptr toString() const &; - - // DoubleList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isDoubleList() const { return Tag::DoubleList == tag; } - c10::intrusive_ptr toDoubleList() &&; - c10::intrusive_ptr toDoubleList() const &; - - // BoolList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isBoolList() const { return Tag::BoolList == tag; } - c10::intrusive_ptr toBoolList() &&; - c10::intrusive_ptr toBoolList() const &; - - //TensorList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isTensorList() const { return Tag::TensorList == tag; } - c10::intrusive_ptr toTensorList() &&; - c10::intrusive_ptr toTensorList() const &; - - //GenericList - IValue(c10::intrusive_ptr v); - IValue(std::vector v); - bool isGenericList() const { return Tag::GenericList == tag; } - c10::intrusive_ptr toGenericList() &&; - c10::intrusive_ptr toGenericList() const &; - - // GenericDict - IValue(c10::intrusive_ptr v); - IValue(c10::Dict v); - bool isGenericDict() const { return Tag::GenericDict == tag; } - c10::intrusive_ptr toGenericDict() &&; - c10::intrusive_ptr toGenericDict() const &; - - // ClassType - IValue(c10::intrusive_ptr v); - bool isObject() const { return tag == Tag::Object; } - c10::intrusive_ptr toObject() &&; - c10::intrusive_ptr toObject() const & ; - - // None - bool isNone() const { - return Tag::None == tag; - } - 
std::string toNone() const { - AT_ASSERT(isNone()); - return "None"; - } - // Scalar, which gets encoded as either an Int or a Double - IValue(at::Scalar s) - : IValue() { - if(s.isFloatingPoint()) { - *this = s.toDouble(); - } else { - *this = s.toLong(); - } - } - bool isScalar() const { - return isDouble() || isInt(); - } - at::Scalar toScalar() const { - if(isDouble()) - return toDouble(); - else if(isInt()) - return toInt(); - throw std::runtime_error("IValue is not a Scalar"); - } - - // Device - IValue(c10::Device d) - : tag(Tag::Device), is_intrusive_ptr(false) { - payload.as_device.type = d.type(); - payload.as_device.index = d.index(); - } - bool isDevice() const { return Tag::Device == tag; } - c10::Device toDevice() const { - AT_ASSERT(isDevice()); - return c10::Device(payload.as_device.type, payload.as_device.index); - } - - // ScalarType - at::ScalarType toScalarType() const { - return static_cast(toInt()); - } - - // Layout - at::Layout toLayout() const { - return static_cast(toInt()); - } - - // for debugging - std::string tagKind() const { - switch(tag) { - #define DEFINE_CASE(x) case Tag::x: return #x; - TORCH_FORALL_TAGS(DEFINE_CASE) - #undef DEFINE_CASE - } - return "Invalid Tag"; - } - - // generic v.to() implementations - // that can be used in special functions like pop/push - // that use template meta-programming. - // prefer the directly named methods when you can, - // since they are simpler to understand - - // Note: if you get linker errors saying one of these is missing, - // change it to ... && = delete; and you will see better error messages for why - // However, we cannot commit this because some compiler versions barf on it. - template - T to() &&; - template - T to() const &; - - // ToOptional: convert a IValue to the Optional obj that accepts both T and None - template - optional toOptional(); - - // this is a shallow comparison of two IValues to test the object identity - bool isSameIdentity(const IValue& rhs) const; - - CAFFE2_API friend std::ostream& operator<<( - std::ostream& out, - const IValue& v); - - bool isPtrType() const { - return is_intrusive_ptr; - } - - private: - // NOTE: IValue tags are intentionally private. In the future we may encode - // this value different (e.g. using NaN boxing), and this would make it more - // costly to determine the tag for all types vs just determining if something - // is a particular type. Instead we want clients to use the `isX` methods when - // possible. If for perf. reasons you really, absolutely, must have a jump - // table, then we can revisit this. 
- enum class Tag : uint32_t { -#define DEFINE_TAG(x) x, - TORCH_FORALL_TAGS(DEFINE_TAG) -#undef DEFINE_TAG - }; - - template> - c10::intrusive_ptr moveToIntrusivePtr(); - template> - c10::intrusive_ptr toIntrusivePtr() const; - - void clearToNone() { - payload.as_int = 0; - tag = Tag::None; - is_intrusive_ptr = false; - } - union { - int64_t as_int; - double as_double; - bool as_bool; - c10::intrusive_ptr_target* as_intrusive_ptr; - struct { - DeviceType type; - DeviceIndex index; - } as_device; - } payload; - Tag tag; - bool is_intrusive_ptr; -}; - -} diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h new file mode 100644 index 000000000000..9aba56ebc84e --- /dev/null +++ b/aten/src/ATen/core/ivalue_inl.h @@ -0,0 +1,652 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace script { +struct Function; +} +} // namespace jit +} // namespace torch +namespace c10 { +struct IValue; +struct ClassType; + +template +c10::intrusive_ptr IValue::moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + clearToNone(); + return t; +} +template +c10::intrusive_ptr IValue::toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + auto p = r; + r.release(); + return p; +} + +inline c10::intrusive_ptr IValue::toTuple() && { + AT_ASSERT(isTuple()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toTuple() const & { + AT_ASSERT(isTuple()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toFuture() && { + AT_ASSERT(isFuture()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toFuture() const & { + AT_ASSERT(isFuture()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toIntList() && { + AT_ASSERT(isIntList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toIntList() const & { + AT_ASSERT(isIntList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toString() && { + AT_ASSERT(isString()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toString() const & { + AT_ASSERT(isString()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toDoubleList() && { + AT_ASSERT(isDoubleList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toDoubleList() const & { + AT_ASSERT(isDoubleList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toBoolList() && { + AT_ASSERT(isBoolList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toBoolList() const & { + AT_ASSERT(isBoolList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toTensorList() && { + AT_ASSERT(isTensorList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toTensorList() const & { + AT_ASSERT(isTensorList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericList() && { + AT_ASSERT(isGenericList()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericList() const & { + AT_ASSERT(isGenericList()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericDict() && { + AT_ASSERT(isGenericDict()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toGenericDict() const & { + AT_ASSERT(isGenericDict()); + return toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toObject() && { + AT_ASSERT(isObject()); + return 
toIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toObject() const & { + AT_ASSERT(isObject()); + return toIntrusivePtr(); +} +inline at::Tensor IValue::toTensor() && { + AT_ASSERT(isTensor()); + return at::Tensor(moveToIntrusivePtr()); +} +inline at::Tensor IValue::toTensor() const & { + AT_ASSERT(isTensor()); + return at::Tensor(toIntrusivePtr()); +} +inline c10::intrusive_ptr IValue::toBlob() && { + AT_ASSERT(isBlob()); + return moveToIntrusivePtr(); +} +inline c10::intrusive_ptr IValue::toBlob() const & { + AT_ASSERT(isBlob()); + return toIntrusivePtr();; +} + +namespace ivalue { + +template +using Shared = c10::intrusive_ptr; + +// string +struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { + private: + const std::string str_; + public: + ConstantString(std::string str) + : str_(std::move(str)) {} + static c10::intrusive_ptr create(std::string str_); + const std::string & string() const { + return str_; + } + operator const std::string & () const { + return string(); + } + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const ConstantString& v); +}; + +template +struct CAFFE2_API List : c10::intrusive_ptr_target { + private: + std::vector elements_; + + public: + typedef Elem ElemType; + + List(std::vector elements_) : elements_(std::move(elements_)) {} + static c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); + } + const std::vector& elements() const & { + return elements_; + } + operator const std::vector&() const { + return elements(); + } + + std::vector& elements() & { + return elements_; + } + operator std::vector&() { + return elements(); + } + + std::vector&& elements() && { + return std::move(elements_); + } +}; + +struct Future; +struct GenericDict; + +struct CAFFE2_API Tuple : public List { + using List::List; + static c10::intrusive_ptr create(std::vector elements_) { + return c10::make_intrusive(std::move(elements_)); + } +}; + +struct Object; +} + +// Future +struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { + private: + c10::intrusive_ptr intrusive_from_this() { + c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer + // from a raw `this` pointer + // so we need to bump the refcount + // to account for this ownership + return c10::intrusive_ptr::reclaim(this); + } + + public: + struct CAFFE2_API FutureError final : public std::exception { + FutureError(std::string&& error_msg_) + : error_msg(std::move(error_msg_)) {} + + FutureError() = default; + + const char* what() const noexcept override { + return error_msg.c_str(); + } + + std::string error_msg; + }; + + /** + * Wait on the future until it completes. + */ + void wait() { + if (completed()) { + return; + } + std::condition_variable finished; + bool fired = false; + + // Add a callback to notify the current thread + // when the current future completes. + addCallback([&] { + std::unique_lock lock(mutex_); + finished.notify_all(); + fired = true; + }); + + // The current thread will be blocked unless the above callback is fired. + std::unique_lock lock(mutex_); + while (!fired) { + finished.wait(lock); + } + + AT_ASSERT(completed()); + } + + /** + * Explicitly mark the future as completed with the output value. 
+ */ + void markCompleted(IValue value) { + { + // This is not to protect completed_ but to create a barrier + // from possible addCallback() calls + std::unique_lock lock(mutex_); + AT_ASSERT(!completed()); + completed_ = true; + value_ = std::move(value); + } + + fireCallbacks(); + } + + void markCompleted(FutureError&& error_) { + { + // This is not to protect completed_ but to create a barrier + // from possible addCallback() calls + std::unique_lock lock(mutex_); + AT_ASSERT(!completed()); + completed_ = true; + has_error = true; + error = std::move(error_); + } + + fireCallbacks(); + } + + // Get the result of the current future. + IValue value() { + std::unique_lock lock(mutex_); + AT_ASSERT(completed()); + if (has_error) { + throw error; + } + return value_; + } + + /** + * Add a callback to the future. + * The callbacks will be executed once the future completes. + * If the future has already completed, + * this function will execute the callback immediately. + */ + void addCallback(std::function callback) { + std::unique_lock lock(mutex_); + if (completed()) { + lock.unlock(); + callback(); + return; + } + callbacks.push_back(callback); + } + + // Check if the current future has completed + bool completed() { + return completed_; + } + + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const Future& v); + + private: + void fireCallbacks() { + AT_ASSERT(completed()); + // There is no need to protect callbacks with the lock. + // Once completed_ is set to true, no one can add new callback to the list. + for (auto& callback : callbacks) { + callback(); + } + callbacks.clear(); + } + + std::mutex mutex_; + IValue value_; // when finished the value + std::atomic_bool completed_ = {false}; // is this future complete + std::vector> callbacks; + bool has_error = false; + FutureError error; +}; + +// User-defined object. +struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { + public: + Object(std::shared_ptr type, size_t numSlots) : type_(std::move(type)) { + slots_.resize(numSlots); + } + + static c10::intrusive_ptr create( + std::shared_ptr type, + size_t numSlots) { + return c10::make_intrusive(std::move(type), numSlots); + } + + /** + * Slot API. + * + * Attributes are stored as a simple vector so that lookups are fast at + * runtime. A "slot" is just an index into that vector, which can be computed + * statically if you have access to the class type. Use this API if you are + * writing compiler stuff. + */ + void setSlot(size_t slot, IValue v) { + if (slot >= slots_.size()) { + // for module types, it is possible that the members of the class have + // expanded after the object was created. In this case, we expand + // the slots to the right size + resizeObject(slot); + } + slots_[slot] = v; + } + + const IValue& getSlot(size_t slot) const { + return slots_.at(slot); + } + + /** + * Attribute API. + * + * Wrappers around the slot stuff so that users can access attributes + * directly. Use this API if you are a user. + * + * Note: Unlike in Python, TorchScript must make a distinction between + * attributes (which are IValues) and methods (which are Methods). 
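Stepping back to the ivalue::Future defined just above: addCallback(), markCompleted(), wait() and value() are meant to be combined as in the following sketch. It is illustrative only and assumes Future is default-constructible, which the declaration above suggests.

#include <ATen/core/ivalue.h>

void future_sketch() {
  auto fut = c10::make_intrusive<c10::ivalue::Future>();

  // Consumer side: the callback runs immediately if the future has already completed.
  fut->addCallback([fut]() {
    c10::IValue result = fut->value();  // value() rethrows the stored FutureError on failure
    (void)result;
  });

  // Producer side: completing the future fires any registered callbacks.
  fut->markCompleted(c10::IValue(7));

  fut->wait();  // returns immediately once completed() is true
}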
If you + * want a method, use `obj.type()->getMethod()` + */ + IValue getAttr(const std::string& name) const; + void setAttr(const std::string& name, IValue v); + + std::string name() const; + + const std::vector& slots() const { + return slots_; + } + std::shared_ptr type() const { + return type_; + } + + private: + void resizeObject(size_t slot); + std::shared_ptr type_; + std::vector slots_; +}; + +struct C10_EXPORT ivalue::GenericDict : c10::intrusive_ptr_target { + private: + c10::impl::GenericDict elements_; + + public: + GenericDict(c10::impl::GenericDict elements_) + : elements_(std::move(elements_)) {} + static c10::intrusive_ptr create( + c10::impl::GenericDict elements_) { + return c10::make_intrusive(std::move(elements_)); + } + const c10::impl::GenericDict& elements() const & { + return elements_; + } + c10::impl::GenericDict& elements() & { + return elements_; + } + + using IterationOrder = std::vector>; + const IterationOrder iterationOrder() const; +}; + +#undef TORCH_FORALL_TAGS + +namespace detail { + +struct _guarded_unsigned_long_unique_dummy final { + _guarded_unsigned_long_unique_dummy(int64_t){}; +}; +using _guarded_unsigned_long = c10::guts::conditional_t< + std::is_same::value || + std::is_same::value, + _guarded_unsigned_long_unique_dummy, + unsigned long>; + +} // namespace detail + +#define DEFINE_TO(type, method_name) \ +template<> \ +inline type IValue::to() && { \ + return std::move(*this).method_name(); \ +} \ +template<> \ +inline type IValue::to() const & { \ + return this->method_name(); \ +} +DEFINE_TO(at::Tensor, toTensor) +DEFINE_TO(c10::intrusive_ptr, toTuple) +DEFINE_TO(float, toDouble) +DEFINE_TO(double, toDouble) +DEFINE_TO(unsigned char, toInt) +DEFINE_TO(signed char, toInt) +DEFINE_TO(unsigned short, toInt) +DEFINE_TO(short, toInt) +DEFINE_TO(int, toInt) +DEFINE_TO(uint32_t, toInt) +DEFINE_TO(uint64_t, toInt) +DEFINE_TO(detail::_guarded_unsigned_long, toInt) +DEFINE_TO(int64_t, toInt) +DEFINE_TO(bool, toBool) +DEFINE_TO(c10::intrusive_ptr, toBlob); +DEFINE_TO(c10::intrusive_ptr, toDoubleList) +DEFINE_TO(c10::intrusive_ptr, toIntList) +DEFINE_TO(c10::intrusive_ptr, toBoolList) +DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toGenericList) +DEFINE_TO(c10::intrusive_ptr, toGenericDict) +DEFINE_TO(c10::intrusive_ptr, toString) +DEFINE_TO(c10::intrusive_ptr, toObject) +DEFINE_TO(at::Scalar, toScalar) +DEFINE_TO(std::vector, toIntListRef) +DEFINE_TO(std::vector, toDoubleListRef) +DEFINE_TO(std::vector, toBoolListRef) +DEFINE_TO(std::vector, toTensorListRef) +DEFINE_TO(std::vector, toGenericListRef) +DEFINE_TO(std::string, toStringRef) +DEFINE_TO(c10::intrusive_ptr, toFuture) +DEFINE_TO(IValue, toIValue) +DEFINE_TO(c10::Device, toDevice) +DEFINE_TO(at::ScalarType, toScalarType) +DEFINE_TO(at::Layout, toLayout) +DEFINE_TO(at::MemoryFormat, toMemoryFormat) + +template +struct _fake_type {}; + +template +std::vector generic_to( + const IValue* ivalue, + _fake_type>) { + return fmap(ivalue->toGenericListRef(), [](IValue item_ivalue) { return item_ivalue.to(); }); +} + +template +std::unordered_map generic_to( + const IValue* ivalue, + _fake_type>) { + std::unordered_map specialized_dict; + + for (auto item : ivalue->toGenericDictRef()) { + specialized_dict[item.key().to()] = item.value().to(); + } + + return specialized_dict; +} + +template +inline T IValue::to() && { + return generic_to(this, _fake_type{}); +} + +template +inline T IValue::to() const& { + return generic_to(this, _fake_type{}); +} + +// note: when adding a 
DEFINE_TO case here you should also add a +// toX method to IValue. These named methods are much more discoverable +// than the to templated function. + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Tuple), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::IntList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::IntList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::String), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::string v) +: IValue(ivalue::ConstantString::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::DoubleList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::DoubleList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::BoolList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::BoolList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::TensorList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::TensorList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(ivalue::GenericList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericDict), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(c10::impl::GenericDict v) +: IValue(ivalue::GenericDict::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Object), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Future), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline const std::vector& IValue::toIntListRef() const { + return toIntList()->elements(); +} + +inline const std::vector& IValue::toDoubleListRef() const { + return toDoubleList()->elements(); +} + +inline const std::vector& IValue::toTensorListRef() const { + return toTensorList()->elements(); +} + +inline const std::vector& IValue::toBoolListRef() const { + return toBoolList()->elements(); +} + +inline const std::vector& IValue::toGenericListRef() const { + return toGenericList()->elements(); +} + +inline const c10::impl::GenericDict& IValue:: + toGenericDictRef() const { + return toGenericDict()->elements(); +} + +inline const std::string& IValue::toStringRef() const { + return toString()->string(); +} + +template +inline optional IValue::toOptional() { + if (this->isNone()) { + return nullopt; + } + return this->to(); +} + +inline bool IValue::isSameIdentity(const IValue& rhs) const { + // We choose to not use memcmp for payload check due to potential random padding characters on union type + + // Semantics: + // 1. None is None, False is False, and True is True are all true + // 2. If it is a tensor type, we need to take undefined tensor into account + // 3. Undefined_tensor is None and vice versa should be true + // 4. If it is a reference type (i.e. 
is_intrusive_ptr), then it is True when the pointed-to object is the same.
+  // 5. False for all other comparisons.
+  if (this->isNone() && rhs.isNone()) {
+    return true;
+  } else if (this->isBool() && rhs.isBool()) {
+    // for bool type, do equality check
+    return this->toBool() == rhs.toBool();
+  } else if (this->isTensor() && rhs.isTensor()) {
+    // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr is false for undefined tensor
+    return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr;
+  } else if (this->isTensor() && rhs.isNone()) {
+    // special case: undefined tensor and None are the same identity
+    return !this->is_intrusive_ptr;
+  } else if (this->isNone() && rhs.isTensor()) {
+    // special case: undefined tensor and None are the same identity
+    return !rhs.is_intrusive_ptr;
+  } else {
+    // for objects held in IValue, do a shallow compare on the pointer address to test identity
+    return this->is_intrusive_ptr && rhs.is_intrusive_ptr
+      && this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr;
+  }
+}
+
+} // namespace c10
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index 092c9c38fa73..350a0100ee23 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -494,6 +494,15 @@ struct CAFFE2_API ProfiledTensorType : public TensorType {
     return ProfiledTensorTypePtr(new ProfiledTensorType(scalar_type, device, sizes, strides, requires_grad));
   }
+  static ProfiledTensorTypePtr create(ProfiledTensorTypePtr pttp) {
+    return ProfiledTensorTypePtr(new ProfiledTensorType(
+        pttp->scalarType(),
+        pttp->device(),
+        pttp->sizes(),
+        pttp->strides(),
+        pttp->requiresGrad()));
+  }
+
   const VaryingShape& sizes() const { return sizes_; }
   const VaryingStrides& strides() const { return strides_; }
   c10::optional device() const { return device_; }
diff --git a/aten/src/ATen/core/op_registration/base.h b/aten/src/ATen/core/op_registration/base.h
deleted file mode 100644
index 3fdf6e9314b1..000000000000
--- a/aten/src/ATen/core/op_registration/base.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#pragma once
-
-/**
- * This file sets up the basics for operator registration.
- *
- * You probably don't want to include this file directly but include
- * op_registration.h instead since that adds more functionality you'll
- * likely need to register your operators.
- */
-
-#include
-
-namespace c10 {
-
-namespace detail {
-
-  // KernelRegistrationConfig accumulates all information from the config
-  // parameters passed to a RegisterOperators::op() call into one object.
-  struct KernelRegistrationConfig final {
-    c10::optional<TensorTypeId> dispatch_key = c10::nullopt;
-    KernelFunction* kernel_func = nullptr;
-    KernelCacheCreatorFunction cache_creator_func = nullptr;
-    std::unique_ptr<FunctionSchema> inferred_function_schema = nullptr;
-  };
-
-  // is_registration_config_parameter is a concept that returns true_type iff its argument is
-  // a valid parameter to be passed to c10::RegisterOperators().op(parameters...)
-  // That is, it must have an apply method that takes a KernelRegistrationConfig*.
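The base.h and dispatch_key.h files deleted here (together with the kernel<...>() helper removed from kernel_function.h further down) implemented the old free-function registration style; the test updates later in this diff switch to the RegisterOperators::options() builder. A hedged before/after sketch, with a made-up operator name and kernel:

#include <ATen/core/op_registration/op_registration.h>

namespace {
at::Tensor my_kernel_cpu(at::Tensor a) { return a; }  // hypothetical kernel, for illustration only
}

// Old style (removed by this diff):
//   static auto registry = c10::RegisterOperators()
//       .op("my_ns::my_op(Tensor a) -> Tensor",
//           c10::kernel<decltype(my_kernel_cpu), &my_kernel_cpu>(),
//           c10::dispatchKey(c10::CPUTensorId()));

// New style (as used by the updated tests):
static auto registry = c10::RegisterOperators()
    .op("my_ns::my_op(Tensor a) -> Tensor",
        c10::RegisterOperators::options()
            .kernel<decltype(my_kernel_cpu), &my_kernel_cpu>()
            .dispatchKey(c10::CPUTensorId()));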
- template - struct is_registration_config_parameter : std::false_type { - static_assert(std::is_same>::value, "is_registration_config_parameter doesn't work with reference types"); - }; - template - struct is_registration_config_parameter().apply(std::declval()), - std::declval().apply(std::declval()) - )>> : std::true_type { - static_assert(std::is_same>::value, "is_registration_config_parameter doesn't work with reference types"); - }; - static_assert(!is_registration_config_parameter::value, "For classes that aren't registration parameters, this concept should return false"); - // note: the corresponding asserts that the concept returns true are next to the definition of the corresponding classes - - // Take a list of configuration parameters and return a - // KernelRegistrationConfig accumulating all their configurations. - template - KernelRegistrationConfig make_registration_config(ConfigParameters&&... configParameters) { - static_assert(guts::conjunction>...>::value, "One of the parameters isn't a valid registration config parameter."); - - KernelRegistrationConfig config; - - // apply all configParameters - (void)std::initializer_list{(std::forward(configParameters).apply(&config), 0)...}; - - return config; - } -} - -} diff --git a/aten/src/ATen/core/op_registration/dispatch_key.h b/aten/src/ATen/core/op_registration/dispatch_key.h deleted file mode 100644 index a4ced362d9e5..000000000000 --- a/aten/src/ATen/core/op_registration/dispatch_key.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -/** - * This file implements c10::dispatchKey() which is used in the kernel - * registration API to set the dispatch key for a registered kernel. - * - * You probably don't want to include this file directly but include - * op_registration.h instead since that adds more functionality you'll - * likely need to register your operators. - */ - -#include - -namespace c10 { - -namespace detail { - struct DispatchKeyConfigParameter final { - explicit constexpr DispatchKeyConfigParameter(TensorTypeId dispatch_key) - : dispatch_key_(dispatch_key) {} - - void apply(KernelRegistrationConfig* registration) const { - registration->dispatch_key = dispatch_key_; - } - - private: - TensorTypeId dispatch_key_; - }; - static_assert(is_registration_config_parameter::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); -} - -/** - * Use this to register an operator with a kernel for a certain dispatch key. 
- * - * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > class my_kernel_cuda final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())) - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CUDATensorId())); - */ -inline constexpr detail::DispatchKeyConfigParameter dispatchKey(TensorTypeId dispatch_key) { - return detail::DispatchKeyConfigParameter(dispatch_key); -} - -} diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 36f681efb82c..32846755083c 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -98,6 +98,6 @@ FunctionSchema inferFunctionSchema(std::string name, std::string overload_name) return detail::createFunctionSchemaFromTraits>(std::move(name), std::move(overload_name)); } -C10_API void assertSchemasHaveSameSignature(const FunctionSchema& inferred, const FunctionSchema& specified); +CAFFE2_API void assertSchemasHaveSameSignature(const FunctionSchema& inferred, const FunctionSchema& specified); } diff --git a/aten/src/ATen/core/op_registration/kernel_function.h b/aten/src/ATen/core/op_registration/kernel_function.h index e5d7bb3fde1e..b08490d856ba 100644 --- a/aten/src/ATen/core/op_registration/kernel_function.h +++ b/aten/src/ATen/core/op_registration/kernel_function.h @@ -26,26 +26,4 @@ namespace detail { }; } -/** - * Use this to register an operator whose kernel is implemented by a function: - * - * Example: - * - * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); - */ -template -inline constexpr auto kernel() -> -// enable_if: only enable it if FuncType is actually a function -guts::enable_if_t::value, -decltype(kernel::type>())> { - static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) 
API or also implement the kernel function as defined by the public API."); - - return kernel::type>(); -} - } diff --git a/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp b/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp index 5d13721de165..b4b24aff78da 100644 --- a/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_function_legacy_test.cpp @@ -18,8 +18,6 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -27,7 +25,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -364,7 +361,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithIntInput int64_t captured_input_list_size = 0; -void kernelWithIntListInputWithoutOutput(Tensor, ArrayRef input1) { +void kernelWithIntListInputWithoutOutput(Tensor, const std::vector& input1) { captured_input_list_size = input1.size(); } @@ -381,7 +378,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithIntListI EXPECT_EQ(3, captured_input_list_size); } -int64_t kernelWithIntListInputWithOutput(Tensor, ArrayRef input1) { +int64_t kernelWithIntListInputWithOutput(Tensor, const std::vector& input1) { return input1.size(); } @@ -397,7 +394,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithIntListI EXPECT_EQ(3, outputs[0].toInt()); } -void kernelWithTensorListInputWithoutOutput(ArrayRef input1) { +void kernelWithTensorListInputWithoutOutput(const std::vector& input1) { captured_input_list_size = input1.size(); } @@ -414,7 +411,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithTensorLi EXPECT_EQ(2, captured_input_list_size); } -int64_t kernelWithTensorListInputWithOutput(ArrayRef input1) { +int64_t kernelWithTensorListInputWithOutput(const std::vector& input1) { return input1.size(); } @@ -496,6 +493,27 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithLegacyTe EXPECT_EQ(2, outputs[0].toInt()); } +std::vector kernelWithStringListOutput(std::vector input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithStringListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::stringlist_output(str[] input) -> str[]", &kernelWithStringListOutput); + + auto op = c10::Dispatcher::singleton().findSchema("_test::stringlist_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector list{"value1", "value2"}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + auto output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ("value1", output[0].toString()->string()); + EXPECT_EQ("value2", output[1].toString()->string()); +} + int captured_dict_size = 0; void kernelWithDictInputWithoutOutput(Dict input1) { @@ -622,6 +640,118 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithUnordere EXPECT_EQ("value2", output.at("key2")); } +std::unordered_map> kernelWithMapOfIntList(std::unordered_map> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithMapOfList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + 
.op("_test::dict_output(Dict(str, int[]) input) -> Dict(str, int[])", &kernelWithMapOfIntList); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map> dict; + dict.insert({"key1", std::vector{10, 20}}); + dict.insert({"key2", std::vector{30, 40}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output.at("key1").size()); + EXPECT_EQ(10, output.at("key1")[0]); + EXPECT_EQ(20, output.at("key1")[1]); + EXPECT_EQ(2, output.at("key2").size()); + EXPECT_EQ(30, output.at("key2")[0]); + EXPECT_EQ(40, output.at("key2")[1]); +} + +std::unordered_map>> kernelWithMapOfListOfMap(std::unordered_map>> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithMapOfListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::dict_output(Dict(str, Dict(int,str)[]) input) -> Dict(str, Dict(int,str)[])", &kernelWithMapOfListOfMap); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map>> dict; + dict.insert({"key1", {{{10, "10"}, {20, "20"}}}}); + dict.insert({"key2", {{{30, "30"}, {40, "40"}}}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(1, output.at("key1").size()); + EXPECT_EQ(2, output.at("key1")[0].size()); + EXPECT_EQ("10", output.at("key1")[0][10]); + EXPECT_EQ("20", output.at("key1")[0][20]); + EXPECT_EQ(2, output.at("key2")[0].size()); + EXPECT_EQ("30", output.at("key2")[0][30]); + EXPECT_EQ("40", output.at("key2")[0][40]); +} + +std::vector> kernelWithListOfMap(std::vector> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int)[] input) -> Dict(str, int)[]", &kernelWithListOfMap); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector> list{{{"1", 1}, {"2", 2}}, {{"3", 3}, {"4", 4}}}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toInt()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("2").toInt()); + EXPECT_EQ(2, output[1].toGenericDictRef().size()); + EXPECT_EQ(3, output[1].toGenericDictRef().at("3").toInt()); + EXPECT_EQ(4, output[1].toGenericDictRef().at("4").toInt()); +} + +std::vector>> kernelWithListOfMapOfIntList(std::vector>> input) { + return input; +} + +TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithListOfMapOfIntList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int[])[] input) -> Dict(str, int[])[]", &kernelWithListOfMapOfIntList); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector>> list{{{"1", {1, 2}}, {"3", {3, 4}}}, {{"5", {5, 6}}, {"7", {7, 8}}}}; + auto outputs = callOp(*op, list); + 
EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toIntListRef()[0]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef()[1]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("3").toIntListRef().size()); + EXPECT_EQ(3, output[0].toGenericDictRef().at("3").toIntListRef()[0]); + EXPECT_EQ(4, output[0].toGenericDictRef().at("3").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("5").toIntListRef().size()); + EXPECT_EQ(5, output[1].toGenericDictRef().at("5").toIntListRef()[0]); + EXPECT_EQ(6, output[1].toGenericDictRef().at("5").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("7").toIntListRef().size()); + EXPECT_EQ(7, output[1].toGenericDictRef().at("7").toIntListRef()[0]); + EXPECT_EQ(8, output[1].toGenericDictRef().at("7").toIntListRef()[1]); +} + bool called = false; void kernelWithoutInputs() { @@ -760,7 +890,7 @@ TEST(OperatorRegistrationTest_LegacyFunctionBasedKernel, givenKernelWithOptional EXPECT_TRUE(outputs[2].isNone()); } -std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, ArrayRef arg3) { +std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, const std::vector& arg3) { return {}; } diff --git a/aten/src/ATen/core/op_registration/kernel_function_test.cpp b/aten/src/ATen/core/op_registration/kernel_function_test.cpp index 0bad6f2486ac..397cc5e60bd2 100644 --- a/aten/src/ATen/core/op_registration/kernel_function_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_function_test.cpp @@ -6,8 +6,6 @@ #include using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -15,7 +13,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -60,32 +57,32 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::error(Tensor 
dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); - auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -108,7 +105,7 @@ void kernelWithoutOutput(const Tensor&) { } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::no_return", ""); ASSERT_TRUE(op.has_value()); @@ -124,7 +121,7 @@ std::tuple<> kernelWithZeroOutputs(const Tensor&) { } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -140,7 +137,7 @@ int64_t kernelWithIntOutput(Tensor, int64_t a, int64_t b) { TEST(OperatorRegistrationTest_FunctionBasedKernel, 
givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_output(Tensor dummy, int a, int b) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_output(Tensor dummy, int a, int b) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); ASSERT_TRUE(op.has_value()); @@ -156,8 +153,8 @@ Tensor kernelWithTensorOutput(const Tensor& input) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); ASSERT_TRUE(op.has_value()); @@ -177,7 +174,7 @@ std::vector kernelWithTensorListOutput(const Tensor& input1, const Tenso TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -196,7 +193,7 @@ std::vector kernelWithIntListOutput(const Tensor&, int64_t input1, int6 TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -224,7 +221,7 @@ std::tuple, c10::optional, Dict (Tensor, int, Tensor[], int?, Dict(str, Tensor))", kernel(), dispatchKey(TensorType1())); + .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -253,8 +250,8 @@ Tensor kernelWithTensorInputByValueWithOutput(Tensor input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", 
RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -270,8 +267,8 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByR TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -297,8 +294,8 @@ void kernelWithTensorInputByValueWithoutOutput(Tensor input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -314,8 +311,8 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByR TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -337,7 +334,7 @@ void kernelWithIntInputWithoutOutput(Tensor, int64_t input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -354,7 +351,7 @@ int64_t kernelWithIntInputWithOutput(Tensor, int64_t input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); 
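A related signature change runs through these tests: list arguments declared as int[] or Tensor[] in the schema now reach the kernel as const std::vector references instead of ArrayRef (see the kernelWithIntListInput and kernelWithTensorListInput updates just below). An illustrative kernel, not taken from this diff:

#include <cstdint>
#include <vector>
#include <ATen/ATen.h>

// The leading Tensor argument only carries the dispatch key; the int[] schema
// argument now arrives as a const std::vector<int64_t>&.
int64_t sum_int_list(at::Tensor /*dummy*/, const std::vector<int64_t>& xs) {
  int64_t total = 0;
  for (int64_t x : xs) {
    total += x;
  }
  return total;
}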
auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -366,13 +363,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntInput_withO int64_t captured_input_list_size = 0; -void kernelWithIntListInputWithoutOutput(Tensor, ArrayRef input1) { +void kernelWithIntListInputWithoutOutput(Tensor, const std::vector& input1) { captured_input_list_size = input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -383,13 +380,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_w EXPECT_EQ(3, captured_input_list_size); } -int64_t kernelWithIntListInputWithOutput(Tensor, ArrayRef input1) { +int64_t kernelWithIntListInputWithOutput(Tensor, const std::vector& input1) { return input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -399,13 +396,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithIntListInput_w EXPECT_EQ(3, outputs[0].toInt()); } -void kernelWithTensorListInputWithoutOutput(ArrayRef input1) { +void kernelWithTensorListInputWithoutOutput(const std::vector& input1) { captured_input_list_size = input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -416,13 +413,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInpu EXPECT_EQ(2, captured_input_list_size); } -int64_t kernelWithTensorListInputWithOutput(ArrayRef input1) { +int64_t kernelWithTensorListInputWithOutput(const std::vector& input1) { return input1.size(); } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -440,7 +437,7 @@ void kernelWithDictInputWithoutOutput(Dict input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, 
givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, Tensor) input) -> ()", kernel()); + .op("_test::dict_input(Dict(str, Tensor) input) -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -460,7 +457,7 @@ string kernelWithDictInputWithOutput(Dict input1) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithDictInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, str) input) -> str", kernel()); + .op("_test::dict_input(Dict(str, str) input) -> str", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -479,7 +476,7 @@ Dict kernelWithDictOutput(Dict input) { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithDictOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", kernel()); + .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); ASSERT_TRUE(op.has_value()); @@ -507,7 +504,7 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenFallbackKernelWithoutAny // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel()); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -526,7 +523,7 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenFallbackKernelWithoutTen // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel()); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -548,7 +545,7 @@ void kernelWithOptInputWithoutOutput(Tensor arg1, const c10::optional& a } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -583,7 +580,7 @@ c10::optional kernelWithOptInputWithOutput(Tensor arg1, const c10::optio } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? 
arg2, int? arg3, str? arg4) -> Tensor?", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -617,7 +614,7 @@ kernelWithOptInputWithMultipleOutputs(Tensor arg1, const c10::optional& } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -634,13 +631,13 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernelWithOptionalInputs EXPECT_TRUE(outputs[2].isNone()); } -std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, ArrayRef arg3) { +std::tuple kernelForSchemaInference(Tensor arg1, int64_t arg2, const std::vector& arg3) { return {}; } TEST(OperatorRegistrationTest_FunctionBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", kernel()); + .op("_test::no_schema_specified", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); @@ -658,35 +655,35 @@ template struct kernel_func final { TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentNumArguments_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch() -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch() -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. 
Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 3 but inferred 2" ); } @@ -694,18 +691,18 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDif TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentArgumentType_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg1, int arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, int arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg1, float arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, float arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in argument 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(int arg1, int arg2) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(int arg1, int arg2) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in argument 1: specified int but inferred Tensor" ); } @@ -713,58 +710,58 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDif TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentNumReturns_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 1" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 0" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 2 but inferred 0" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 3 but inferred 2" ); } @@ -772,46 +769,46 @@ TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDif TEST(OperatorRegistrationTest_FunctionBasedKernel, givenMismatchedKernel_withDifferentReturnTypes_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified Tensor but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred int" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel::func), &kernel_func::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel::func), &kernel_func::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred Tensor" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, int)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, int)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, float)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, float)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel, Tensor>::func), &kernel_func, Tensor>::func>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel, Tensor>::func), &kernel_func, Tensor>::func>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified int but inferred Tensor" ); } diff --git a/aten/src/ATen/core/op_registration/kernel_functor.h b/aten/src/ATen/core/op_registration/kernel_functor.h index 8966b734cc14..f43c0fc3ef56 100644 --- a/aten/src/ATen/core/op_registration/kernel_functor.h +++ 
b/aten/src/ATen/core/op_registration/kernel_functor.h @@ -1,6 +1,5 @@ #pragma once -#include #include namespace c10 { @@ -52,17 +51,6 @@ namespace detail { } }; template - struct ivalue_to_arg_type> { - static ArrayRef call(const IValue& v) { - // Note: This takes a `const IValue&` argument and not `IValue&&`, because the - // returned ArrayRef is non-owning, so the call site needs to keep ownership - // TODO Do we want to support ArrayRef> ? - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: c10::ArrayRef and T is not one of the supported primitive types."); - static_assert(!std::is_same::value, "You tried to register a kernel with an unsupported argument type: c10::ArrayRef. Please use c10::ArrayRef, c10::ArrayRef or Tensor instead."); - return v.to>>()->elements(); - } - }; - template struct ivalue_to_arg_type> { static optional call(IValue&& v) { if (v.isNone()) { @@ -81,24 +69,40 @@ namespace detail { return impl::toTypedDict(std::move(dict_ptr->elements())); } }; - // The following specialisations of ivalue_to_arg_type are technically not - // necessary since we would hit the base case and show an error message - // there if they didn't exist, but we can show a better error message - // in some common error scenarios. template - struct ivalue_to_arg_type> { - // We don't support std::vector because that would prevent us from doing - // internal optimization to how we represent lists (e.g. SmallVector). - // Users should use ArrayRef instead. - static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported argument type: std::vector. Please use c10::ArrayRef instead."); + struct ivalue_to_arg_type, guts::enable_if_t::value && !std::is_same::value>> final { + static std::vector call(IValue&& v) { + return std::move(*std::move(v).to>>()).elements(); + } + }; + template + struct ivalue_to_arg_type, guts::enable_if_t::value || std::is_same::value>> final { + static std::vector call(IValue&& v) { + auto list = std::move(v).toGenericList(); + std::vector result; + result.reserve(list->elements().size()); + for (auto&& elem : std::move(list)->elements()) { + result.push_back(ivalue_to_arg_type::call(std::move(elem))); + } + return result; + } }; template - struct ivalue_to_arg_type> { - // We don't support std::vector because that would prevent us from doing - // internal optimization to how we represent lists (e.g. SmallVector). - // Users should use ArrayRef instead. - static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported argument type: std::unordered_map. Please use c10::Dict instead."); + struct ivalue_to_arg_type> final { + static std::unordered_map call(IValue&& v) { + auto dict = std::move(v).toGenericDict(); + std::unordered_map result; + result.reserve(dict->elements().size()); + for (auto& element : dict->elements()) { + result.emplace(ivalue_to_arg_type::call(element.key()), ivalue_to_arg_type::call(element.value())); + } + return result; + } }; + // The following specialisations of ivalue_to_arg_type are technically not + // necessary since we would hit the base case and show an error message + // there if they didn't exist, but we can show a better error message + // in some common error scenarios. template struct ivalue_to_arg_type::value>> { // There is no reason to support float when we have double. Keep the API lean. 
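
The hunk above swaps the ArrayRef-based argument specialization for std::vector ones that recurse into their element type (with a separate fast path for primitive element types), plus an std::unordered_map specialization that does the same for keys and values. A minimal standalone sketch of that recursive structure follows; the toy Value type stands in for IValue and is an assumption, and the real c10 accessors and SFINAE conditions differ.

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Toy stand-in for IValue (assumption); the real type stores lists and dicts
// behind intrusive pointers and exposes different accessors.
struct Value {
  long long i = 0;
  std::string s;
  std::vector<Value> list;                    // generic list payload
  std::vector<std::pair<Value, Value>> dict;  // generic dict payload
};

// Base case is left undefined: using an unsupported argument type fails to
// compile, which is the role the static_asserts play in the real header.
template <class T, class Enable = void>
struct to_arg_type;

// Primitive fast paths: the payload is stored directly in the Value.
template <>
struct to_arg_type<long long> {
  static long long call(Value&& v) { return v.i; }
};
template <>
struct to_arg_type<std::string> {
  static std::string call(Value&& v) { return std::move(v.s); }
};

// std::vector<T>: convert element-wise, recursing through to_arg_type<T>, so
// nested containers (vector of map, map of vector, ...) work as well.
template <class T>
struct to_arg_type<std::vector<T>> {
  static std::vector<T> call(Value&& v) {
    std::vector<T> result;
    result.reserve(v.list.size());
    for (auto& elem : v.list) {
      result.push_back(to_arg_type<T>::call(std::move(elem)));
    }
    return result;
  }
};

// std::unordered_map<K, V>: same idea, converting keys and values recursively.
template <class K, class V>
struct to_arg_type<std::unordered_map<K, V>> {
  static std::unordered_map<K, V> call(Value&& v) {
    std::unordered_map<K, V> result;
    result.reserve(v.dict.size());
    for (auto& kv : v.dict) {
      result.emplace(to_arg_type<K>::call(std::move(kv.first)),
                     to_arg_type<V>::call(std::move(kv.second)));
    }
    return result;
  }
};

In the real header the recursion bottoms out in the existing primitive specializations, and the specializations kept just above exist only to produce better error messages for unsupported types.
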
@@ -115,37 +119,14 @@ namespace detail { // legacy_ivalue_to_arg_type is like ivalue_to_arg_type but additionally // allows a few deprecated types like std::vector. - template + template struct legacy_ivalue_to_arg_type final { static auto call(IValue&& v) -> decltype(ivalue_to_arg_type::call(std::move(v))) { return ivalue_to_arg_type::call(std::move(v)); } }; - template - struct legacy_ivalue_to_arg_type> final { - static std::vector call(IValue&& v) { - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: std::vector and T is not one of the supported primitive types."); - return std::move(*std::move(v).to>>()).elements(); - } - }; - template - struct legacy_ivalue_to_arg_type> final { - static std::unordered_map call(const IValue& v) { - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: std::unordered_map and Key is not one of the supported primitive types."); - static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported argument type: std::unordered_map and Value is not one of the supported primitive types."); - auto dict_ptr = std::move(v).toGenericDict(); - auto dict = impl::toTypedDict(std::move(dict_ptr->elements())); - std::unordered_map result; - result.reserve(dict.size()); - for (auto& element : dict) { - result.emplace(element.key(), element.value()); - } - return result; - } - }; - - // TODO Make nesting types work, e.g. Dicts of lists, lists of lists, and so on + // TODO Make nesting types work with new style API, e.g. Dicts of lists, lists of lists, and so on template struct return_type_to_ivalue { @@ -153,8 +134,9 @@ namespace detail { }; template struct return_type_to_ivalue::value>> { - static IValue call(T&& v) { - return IValue(std::move(v)); + template + static IValue call(T_&& v) { + return IValue(std::forward(v)); } }; template @@ -167,13 +149,25 @@ namespace detail { } }; template - struct return_type_to_ivalue> { + struct return_type_to_ivalue, guts::enable_if_t::value && !std::is_same::value>> { static IValue call(std::vector&& v) { static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported return type: vector and T is not one of the supported primitive types."); static_assert(!std::is_same::value, "You tried to register a kernel with an unsupported return type: vector. Please use vector, vector or Tensor instead."); return IValue(std::move(v)); } }; + template + struct return_type_to_ivalue, guts::enable_if_t::value || std::is_same::value>> { + static IValue call(std::vector&& v) { + static_assert(!std::is_same::value, "You tried to register a kernel with an unsupported return type: vector. 
Please use vector, vector or Tensor instead."); + std::vector result; + result.reserve(v.size()); + for (auto& elem : v) { + result.push_back(return_type_to_ivalue::call(std::move(elem))); + } + return result; + } + }; template struct return_type_to_ivalue> { static IValue call(c10::Dict&& v) { @@ -182,6 +176,17 @@ namespace detail { return IValue(impl::toGenericDict(std::move(v))); } }; + template + struct return_type_to_ivalue> final { + static IValue call(std::unordered_map&& v) { + c10::impl::GenericDict dict; + dict.reserve(v.size()); + for (auto& element : v) { + dict.insert(return_type_to_ivalue::call(Key{element.first}), return_type_to_ivalue::call(std::move(element.second))); + } + return dict; + } + }; // The following specialisations of return_type_to_ivalue are technically not // necessary since we would hit the base case and show an error message // there if they didn't exist, but we can show a better error message @@ -190,10 +195,6 @@ namespace detail { struct return_type_to_ivalue> { static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported return type: c10::ArrayRef. Please use std::vector instead."); }; - template - struct return_type_to_ivalue> { - static_assert(guts::false_t>::value, "You tried to register a kernel with an unsupported return type: std::unordered_map. Please use c10::Dict instead."); - }; template struct return_type_to_ivalue::value>> { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported return type: float. Please use double instead."); @@ -207,22 +208,12 @@ namespace detail { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported integral return argument type. Please use int64_t instead."); }; // legacy_return_type_to_ivalue is like return_type_to_ivalue but additionally - // allows a few deprecated types like std::vector. - template + // allows a few deprecated types like std::unordered_map. + template struct legacy_return_type_to_ivalue final { - static IValue call(T&& v) { - return return_type_to_ivalue::call(std::move(v)); - } - }; - template - struct legacy_return_type_to_ivalue> final { - static IValue call(std::unordered_map&& v) { - c10::Dict dict; - dict.reserve(v.size()); - for (auto& element : v) { - dict.insert(element.first, element.second); - } - return return_type_to_ivalue>::call(std::move(dict)); + template + static IValue call(T_&& v) { + return return_type_to_ivalue::call(std::forward(v)); } }; @@ -320,63 +311,6 @@ namespace detail { return guts::make_unique(inferFunctionSchema("", "")); } }; - - template - detail::KernelRegistrationConfigParameter...>, detail::FunctionSchemaInferer> - kernelFunctor(ConstructorParameters&&... constructorParameters) { - return { - &detail::wrap_kernel_functor::call, - detail::KernelFactory...>(std::forward(constructorParameters)...), - detail::FunctionSchemaInferer() - }; - } -} - -/** - * Use this to register an operator whose kernel is implemented as a functor - * - * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); - * - * The functor constructor can take arguments to configure the kernel. - * The arguments are defined in the kernel registration. 
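
The comment being removed here documents the old free-function form (c10::kernel plus c10::dispatchKey); its constructor-argument example continues just below. Under the options()-based API that this diff migrates everything to, that example would look roughly like the following sketch. The include path and the c10::CPUTensorId() dispatch key are assumptions taken from the surrounding code, and the kernel body is elided as in the original comment.

// Sketch only: the new-style registration for the functor example from the
// removed comment, assuming the options() builder introduced by this diff.
#include <ATen/core/op_registration/op_registration.h>

#include <string>
#include <utility>

namespace {

class my_kernel_cpu final : public c10::OperatorKernel {
 public:
  explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
      : config_(std::move(some_configuration)), a_(a), b_(b) {}

  at::Tensor operator()(at::Tensor a, at::Tensor b) {
    // ... kernel body elided, as in the original comment ...
    return a;
  }

 private:
  std::string config_;
  int a_;
  bool b_;
};

} // namespace

// Constructor arguments now go to options().kernel<Functor>(...), and the
// dispatch key is chained onto the same options object instead of being a
// separate c10::dispatchKey(...) argument.
static auto registry = c10::RegisterOperators().op(
    "my_op(Tensor a, Tensor b) -> Tensor",
    c10::RegisterOperators::options()
        .kernel<my_kernel_cpu>("some_configuration", 3, true)
        .dispatchKey(c10::CPUTensorId()));
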
- * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) - * > : ... {...} - * > - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel("some_configuration", 3, true), - * > c10::dispatchKey(CPUTensorId())); - */ -template -// enable_if: only enable it if KernelFunctor is actually a functor -inline constexpr guts::enable_if_t::value, -detail::KernelRegistrationConfigParameter...>, detail::FunctionSchemaInferer>> -kernel(ConstructorParameters&&... constructorParameters) { - static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); - - return detail::kernelFunctor(std::forward(constructorParameters)...); } } diff --git a/aten/src/ATen/core/op_registration/kernel_functor_test.cpp b/aten/src/ATen/core/op_registration/kernel_functor_test.cpp index 8dae0d26f2f7..b5dbf44fccff 100644 --- a/aten/src/ATen/core/op_registration/kernel_functor_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_functor_test.cpp @@ -7,8 +7,6 @@ using c10::RegisterOperators; using c10::OperatorKernel; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -16,7 +14,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::unique_ptr; @@ -67,32 +64,32 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctorBasedKernel, 
givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); - auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -117,7 +114,7 @@ struct KernelWithoutOutput final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::no_return", ""); ASSERT_TRUE(op.has_value()); @@ -135,7 +132,7 @@ struct KernelWithZeroOutputs final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -153,7 +150,7 @@ struct KernelWithIntOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_output(Tensor dummy, int a, int b) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_output(Tensor 
dummy, int a, int b) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); ASSERT_TRUE(op.has_value()); @@ -171,8 +168,8 @@ struct KernelWithTensorOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::returning_tensor(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::returning_tensor(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); ASSERT_TRUE(op.has_value()); @@ -194,7 +191,7 @@ struct KernelWithTensorListOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -215,7 +212,7 @@ struct KernelWithIntListOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", kernel(), dispatchKey(TensorType1())); + .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -245,7 +242,7 @@ struct KernelWithMultipleOutputs final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", kernel(), dispatchKey(TensorType1())); + .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -278,8 +275,8 @@ struct KernelWithTensorInputByValueWithOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = 
c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -295,8 +292,8 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByRe TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> Tensor", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> Tensor", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -326,8 +323,8 @@ struct KernelWithTensorInputByValueWithoutOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -343,8 +340,8 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByRe TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType1())) - .op("_test::tensor_input(Tensor input) -> ()", kernel(), dispatchKey(TensorType2())); + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())) + .op("_test::tensor_input(Tensor input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -368,7 +365,7 @@ struct KernelWithIntInputWithoutOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -387,7 +384,7 @@ struct KernelWithIntInputWithOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_input(Tensor dummy, int input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_input(Tensor dummy, int input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", 
""); ASSERT_TRUE(op.has_value()); @@ -400,14 +397,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntInput_withOu int64_t captured_input_list_size = 0; struct KernelWithIntListInputWithoutOutput final : OperatorKernel { - void operator()(Tensor, ArrayRef input1) { + void operator()(Tensor, const std::vector& input1) { captured_input_list_size = input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -419,14 +416,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_wi } struct KernelWithIntListInputWithOutput final : OperatorKernel { - int64_t operator()(Tensor, ArrayRef input1) { + int64_t operator()(Tensor, const std::vector& input1) { return input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::int_list_input(Tensor dummy, int[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -437,14 +434,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithIntListInput_wi } struct KernelWithTensorListInputWithoutOutput final : OperatorKernel { - void operator()(ArrayRef input1) { + void operator()(const std::vector& input1) { captured_input_list_size = input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -456,14 +453,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput } struct KernelWithTensorListInputWithOutput final : OperatorKernel { - int64_t operator()(ArrayRef input1) { + int64_t operator()(const std::vector& input1) { return input1.size(); } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::tensor_list_input(Tensor[] input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -483,7 +480,7 @@ struct KernelWithDictInputWithoutOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - 
.op("_test::dict_input(Dict(str, Tensor) input) -> ()", kernel()); + .op("_test::dict_input(Dict(str, Tensor) input) -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -505,7 +502,7 @@ struct KernelWithDictInputWithOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithDictInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, str) input) -> str", kernel()); + .op("_test::dict_input(Dict(str, str) input) -> str", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_input", ""); ASSERT_TRUE(op.has_value()); @@ -526,7 +523,7 @@ struct KernelWithDictOutput final : OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithDictOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", kernel()); + .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); ASSERT_TRUE(op.has_value()); @@ -556,7 +553,7 @@ class KernelWithCache final : public OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithCache_thenCacheIsKeptCorrectly) { auto registrar = RegisterOperators() - .op("_test::cache_op(Tensor input) -> int", kernel(), dispatchKey(TensorType1())); + .op("_test::cache_op(Tensor input) -> int", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::cache_op", ""); ASSERT_TRUE(op.has_value()); @@ -596,8 +593,8 @@ class KernelWithConstructorArg final : public OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithConstructorArg_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(2), dispatchKey(TensorType1())) - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(4), dispatchKey(TensorType2())); + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(2).dispatchKey(TensorType1())) + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(4).dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::offset_op", ""); ASSERT_TRUE(op.has_value()); @@ -626,8 +623,8 @@ class KernelWithMultipleConstructorArgs final : public OperatorKernel { TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithMultipleConstructorArgs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(2, 3), dispatchKey(TensorType1())) - .op("_test::offset_op(Tensor tensor, int input) -> int", kernel(4, 5), dispatchKey(TensorType2())); + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(2, 3).dispatchKey(TensorType1())) + .op("_test::offset_op(Tensor tensor, int input) -> int", RegisterOperators::options().kernel(4, 5).dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::offset_op", ""); ASSERT_TRUE(op.has_value()); @@ -654,7 +651,7 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenFallbackKernelWithoutAnyA // is no way to get the dispatch key. 
For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel()); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -675,7 +672,7 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenFallbackKernelWithoutTens // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel()); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -699,7 +696,7 @@ struct KernelWithOptInputWithoutOutput final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -736,7 +733,7 @@ struct KernelWithOptInputWithOutput final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -772,7 +769,7 @@ struct KernelWithOptInputWithMultipleOutputs final : OperatorKernel { }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> (Tensor?, int?, str?)", kernel(), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? 
arg4) -> (Tensor?, int?, str?)", RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -790,14 +787,14 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernelWithOptionalInputs_ } struct KernelForSchemaInference final : OperatorKernel { - std::tuple operator()(Tensor arg1, int64_t arg2, ArrayRef arg3) { + std::tuple operator()(Tensor arg1, int64_t arg2, const std::vector& arg3) { return {}; } }; TEST(OperatorRegistrationTest_FunctorBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", kernel()); + .op("_test::no_schema_specified", RegisterOperators::options().kernel()); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); @@ -815,35 +812,35 @@ template struct KernelFunc final : OperatorKernel TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentNumArguments_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch() -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch() -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of arguments is different. 
Specified 3 but inferred 2" ); } @@ -851,18 +848,18 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDiff TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentArgumentType_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg1, int arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, int arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg1, float arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, float arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in argument 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(int arg1, int arg2) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(int arg1, int arg2) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in argument 1: specified int but inferred Tensor" ); } @@ -870,58 +867,58 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDiff TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentNumReturns_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 1" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 0" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 2 but inferred 0" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "The number of returns is different. Specified 3 but inferred 2" ); } @@ -929,46 +926,46 @@ TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDiff TEST(OperatorRegistrationTest_FunctorBasedKernel, givenMismatchedKernel_withDifferentReturnTypes_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified Tensor but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred int" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred Tensor" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, int)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, int)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - 
.op("_test::mismatch(Tensor arg) -> (Tensor, float)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, float)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "Type mismatch in return 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel, Tensor>>(), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel, Tensor>>().dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified int but inferred Tensor" ); } diff --git a/aten/src/ATen/core/op_registration/kernel_lambda.h b/aten/src/ATen/core/op_registration/kernel_lambda.h index 64952cb52db4..5c01bf21bce1 100644 --- a/aten/src/ATen/core/op_registration/kernel_lambda.h +++ b/aten/src/ATen/core/op_registration/kernel_lambda.h @@ -34,32 +34,4 @@ namespace detail { >; } -/** - * Use this to register an operator whose kernel is implemented as a stateless lambda. - * - * Example: - * - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel([] (Tensor a) -> Tensor{...}), - * > c10::dispatchKey(CPUTensorId())); - */ -template -inline constexpr auto kernel(Lambda&& functor) -> -// enable_if: only enable it if Lambda is a functor (note: lambdas are functors) -guts::enable_if_t>::value, -decltype(detail::kernelFunctor>>(std::forward(functor)))> { - static_assert(!std::is_base_of::value, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); - - // We don't support stateful lambdas (i.e. lambdas with a capture), because their - // behavior would be nonobvious. A functor kernel with cache gets a new instance of - // its cache each time the kernel is looked up from the dispatch table. - // A lambda with a capture would be global and share its capture between all kernel lookups. - // So, instead of making users having to think about it (including the thread-safety - // issues this causes), let's just forbid stateful lambdas alltogether. - static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). 
If you need a cache, please use the functor based API kernel() instead."); - - return detail::kernelFunctor>>(std::forward(functor)); -} - } diff --git a/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp b/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp index 4cf9f170dbb9..0359720a781b 100644 --- a/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_lambda_legacy_test.cpp @@ -17,8 +17,6 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -26,7 +24,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -334,7 +331,7 @@ int64_t captured_input_list_size = 0; TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", [] (Tensor, ArrayRef input1) -> void { + .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", [] (Tensor, const std::vector& input1) -> void { captured_input_list_size = input1.size(); }); @@ -349,7 +346,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInp TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::int_list_input(Tensor dummy, int[] input) -> int", [](Tensor, ArrayRef input1) -> int64_t { + .op("_test::int_list_input(Tensor dummy, int[] input) -> int", [](Tensor, const std::vector& input1) -> int64_t { return input1.size(); }); @@ -363,7 +360,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithIntListInp TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> ()", [] (ArrayRef input1) -> void { + .op("_test::tensor_list_input(Tensor[] input) -> ()", [] (const std::vector& input1) -> void { captured_input_list_size = input1.size(); }); @@ -378,7 +375,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithTensorList TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithTensorListRefInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::tensor_list_input(Tensor[] input) -> int", [] (ArrayRef input1) -> int64_t { + .op("_test::tensor_list_input(Tensor[] input) -> int", [] (const std::vector& input1) -> int64_t { return input1.size(); }); @@ -448,6 +445,25 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithLegacyTens EXPECT_EQ(2, outputs[0].toInt()); } +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithStringListOutput_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::stringlist_output(str[] input) -> str[]", [](std::vector input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::stringlist_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector list{"value1", "value2"}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + auto output = 
std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ("value1", output[0].toString()->string()); + EXPECT_EQ("value2", output[1].toString()->string()); +} + TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { int captured_dict_size = 0; @@ -564,6 +580,111 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithUnorderedM EXPECT_EQ("value2", output.at("key2")); } +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithMapOfList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::dict_output(Dict(str, int[]) input) -> Dict(str, int[])", [](std::unordered_map> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map> dict; + dict.insert({"key1", std::vector{10, 20}}); + dict.insert({"key2", std::vector{30, 40}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output.at("key1").size()); + EXPECT_EQ(10, output.at("key1")[0]); + EXPECT_EQ(20, output.at("key1")[1]); + EXPECT_EQ(2, output.at("key2").size()); + EXPECT_EQ(30, output.at("key2")[0]); + EXPECT_EQ(40, output.at("key2")[1]); +} + + +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithMapOfListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::dict_output(Dict(str, Dict(int,str)[]) input) -> Dict(str, Dict(int,str)[])", [](std::unordered_map>> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::dict_output", ""); + ASSERT_TRUE(op.has_value()); + + std::unordered_map>> dict; + dict.insert({"key1", {{{10, "10"}, {20, "20"}}}}); + dict.insert({"key2", {{{30, "30"}, {40, "40"}}}}); + auto outputs = callOp(*op, dict); + EXPECT_EQ(1, outputs.size()); + auto output = c10::impl::toTypedDict>>(std::move(outputs[0].toGenericDict()->elements())); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(1, output.at("key1").size()); + EXPECT_EQ(2, output.at("key1")[0].size()); + EXPECT_EQ("10", output.at("key1")[0][10]); + EXPECT_EQ("20", output.at("key1")[0][20]); + EXPECT_EQ(2, output.at("key2")[0].size()); + EXPECT_EQ("30", output.at("key2")[0][30]); + EXPECT_EQ("40", output.at("key2")[0][40]); +} + +TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithListOfMap_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int)[] input) -> Dict(str, int)[]", [](std::vector> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector> list{{{"1", 1}, {"2", 2}}, {{"3", 3}, {"4", 4}}}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toInt()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("2").toInt()); + EXPECT_EQ(2, output[1].toGenericDictRef().size()); + EXPECT_EQ(3, output[1].toGenericDictRef().at("3").toInt()); + EXPECT_EQ(4, output[1].toGenericDictRef().at("4").toInt()); +} + 
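
These new nested-container tests pass because the return path added to kernel_functor.h recurses the same way as the argument path: return_type_to_ivalue now has std::vector and std::unordered_map specializations that convert element by element. Below is a standalone mirror of the earlier argument-direction sketch, again with a toy Value type standing in for IValue (an assumption), repeated here so the sketch compiles on its own.

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct Value {
  long long i = 0;
  std::string s;
  std::vector<Value> list;
  std::vector<std::pair<Value, Value>> dict;
};

// Unsupported return types fail to compile, playing the role of the
// static_asserts in the real header.
template <class T, class Enable = void>
struct to_value;

template <>
struct to_value<long long> {
  static Value call(long long v) { Value r; r.i = v; return r; }
};
template <>
struct to_value<std::string> {
  static Value call(std::string v) { Value r; r.s = std::move(v); return r; }
};

// Vectors and maps recurse, so a return type like
// std::vector<std::unordered_map<std::string, std::vector<long long>>>
// (the shape used by the list-of-dict-of-int-list test that follows)
// round-trips through the generic representation.
template <class T>
struct to_value<std::vector<T>> {
  static Value call(std::vector<T>&& v) {
    Value r;
    r.list.reserve(v.size());
    for (auto& elem : v) {
      r.list.push_back(to_value<T>::call(std::move(elem)));
    }
    return r;
  }
};
template <class K, class V>
struct to_value<std::unordered_map<K, V>> {
  static Value call(std::unordered_map<K, V>&& v) {
    Value r;
    r.dict.reserve(v.size());
    for (auto& kv : v) {
      r.dict.emplace_back(to_value<K>::call(K{kv.first}),
                          to_value<V>::call(std::move(kv.second)));
    }
    return r;
  }
};
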
+TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithListOfMapOfIntList_whenRegistered_thenCanBeCalled) { + auto registrar = RegisterOperators() + .op("_test::list_output(Dict(str, int[])[] input) -> Dict(str, int[])[]", [](std::vector>> input) { + return input; + }); + + auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); + ASSERT_TRUE(op.has_value()); + + std::vector>> list{{{"1", {1, 2}}, {"3", {3, 4}}}, {{"5", {5, 6}}, {"7", {7, 8}}}}; + auto outputs = callOp(*op, list); + EXPECT_EQ(1, outputs.size()); + std::vector output = std::move(outputs[0].toGenericList()->elements()); + + EXPECT_EQ(2, output.size()); + EXPECT_EQ(2, output[0].toGenericDictRef().size()); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef().size()); + EXPECT_EQ(1, output[0].toGenericDictRef().at("1").toIntListRef()[0]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("1").toIntListRef()[1]); + EXPECT_EQ(2, output[0].toGenericDictRef().at("3").toIntListRef().size()); + EXPECT_EQ(3, output[0].toGenericDictRef().at("3").toIntListRef()[0]); + EXPECT_EQ(4, output[0].toGenericDictRef().at("3").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("5").toIntListRef().size()); + EXPECT_EQ(5, output[1].toGenericDictRef().at("5").toIntListRef()[0]); + EXPECT_EQ(6, output[1].toGenericDictRef().at("5").toIntListRef()[1]); + EXPECT_EQ(2, output[1].toGenericDictRef().at("7").toIntListRef().size()); + EXPECT_EQ(7, output[1].toGenericDictRef().at("7").toIntListRef()[0]); + EXPECT_EQ(8, output[1].toGenericDictRef().at("7").toIntListRef()[1]); +} + TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenFallbackKernelWithoutAnyArguments_whenRegistered_thenCanBeCalled) { // note: non-fallback kernels without tensor arguments don't work because there // is no way to get the dispatch key. 
For operators that only have a fallback @@ -705,7 +826,7 @@ TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernelWithOptionalIn TEST(OperatorRegistrationTest_LegacyLambdaBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", [] (Tensor arg1, int64_t arg2, ArrayRef arg3) -> std::tuple {return {};}); + .op("_test::no_schema_specified", [] (Tensor arg1, int64_t arg2, const std::vector& arg3) -> std::tuple {return {};}); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); diff --git a/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp b/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp index 5ae3ffcdcd75..fd60342793d6 100644 --- a/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_lambda_test.cpp @@ -6,8 +6,6 @@ #include using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -15,7 +13,6 @@ using c10::guts::make_unique; using c10::ivalue::TensorList; using c10::ivalue::IntList; using c10::intrusive_ptr; -using c10::ArrayRef; using c10::Dict; using at::Tensor; using std::string; @@ -47,38 +44,38 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenOutOfLineKernel_whenRegistered_thenCanBeCalled) { auto my_kernel = [] (Tensor, int64_t i) {return i+1;}; - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(my_kernel), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(my_kernel).dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", 
RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType1())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())); - auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {EXPECT_TRUE(false); return 0;}).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i+1;}), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i+1;}).dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel([] (Tensor, int64_t i) {return i-1;}), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t i) {return i-1;}).dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -98,8 +95,9 @@ bool was_called = false; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op("_test::no_return(Tensor dummy) -> ()", - kernel([] (const Tensor&) -> void {was_called = true;}), - dispatchKey(TensorType1())); + RegisterOperators::options() + .kernel([] (const Tensor&) -> void {was_called = true;}) + .dispatchKey(TensorType1())); auto op = 
c10::Dispatcher::singleton().findSchema("_test::no_return", ""); ASSERT_TRUE(op.has_value()); @@ -111,8 +109,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithoutOutput_whenRe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithZeroOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op("_test::zero_outputs(Tensor dummy) -> ()", - kernel([] (const Tensor&) -> std::tuple<> {was_called = true; return {};}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const Tensor&) -> std::tuple<> {was_called = true; return {};}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::zero_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -125,8 +123,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithZeroOutputs_when TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_output(Tensor dummy, int a, int b) -> int", - kernel([] (Tensor, int64_t a, int64_t b) {return a+b;}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, int64_t a, int64_t b) {return a+b;}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_output", ""); ASSERT_TRUE(op.has_value()); @@ -139,11 +137,11 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntOutput_whenRe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::returning_tensor(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType1())) .op("_test::returning_tensor(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::returning_tensor", ""); ASSERT_TRUE(op.has_value()); @@ -160,8 +158,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorOutput_whe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::list_output(Tensor input1, Tensor input2, Tensor input3) -> Tensor[]", - kernel([] (const Tensor& a, const Tensor& b, const Tensor& c) -> std::vector {return {a, b, c};}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const Tensor& a, const Tensor& b, const Tensor& c) -> std::vector {return {a, b, c};}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -177,8 +175,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListOutput TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::list_output(Tensor dummy, int input1, int input2, int input3) -> int[]", - kernel([] (const Tensor&, int64_t a, int64_t b, int64_t c) -> std::vector {return {a,b,c};}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const Tensor&, int64_t a, int64_t b, int64_t c) -> std::vector {return {a,b,c};}) + .dispatchKey(TensorType1())); auto op = 
c10::Dispatcher::singleton().findSchema("_test::list_output", ""); ASSERT_TRUE(op.has_value()); @@ -194,7 +192,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListOutput_wh TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::multiple_outputs(Tensor dummy) -> (Tensor, int, Tensor[], int?, Dict(str, Tensor))", - kernel([] (Tensor) -> std::tuple, c10::optional, Dict> { + RegisterOperators::options().kernel([] (Tensor) -> std::tuple, c10::optional, Dict> { Dict dict; dict.insert("first", dummyTensor(TensorType1())); dict.insert("second", dummyTensor(TensorType2())); @@ -205,8 +203,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithMultipleOutputs_ c10::optional(c10::in_place, 0), dict ); - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::multiple_outputs", ""); ASSERT_TRUE(op.has_value()); @@ -228,11 +226,11 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithMultipleOutputs_ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByReference_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (const Tensor& a) {return a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (const Tensor& a) {return a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -249,11 +247,11 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByRef TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByValue_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (Tensor a) {return a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (Tensor a) {return a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> Tensor", - kernel([] (Tensor a) {return a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (Tensor a) {return a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -272,11 +270,11 @@ Tensor captured_input; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByReference_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (const Tensor& a) -> void {captured_input = a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (const Tensor& a) -> void {captured_input = a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (const Tensor& a) -> void {captured_input = a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (const Tensor& a) -> void {captured_input = a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -293,11 +291,11 @@ 
TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByRef TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorInputByValue_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (Tensor a) -> void {captured_input = a;}), - dispatchKey(TensorType1())) + RegisterOperators::options().kernel([] (Tensor a) -> void {captured_input = a;}) + .dispatchKey(TensorType1())) .op("_test::tensor_input(Tensor input) -> ()", - kernel([] (Tensor a) -> void {captured_input = a;}), - dispatchKey(TensorType2())); + RegisterOperators::options().kernel([] (Tensor a) -> void {captured_input = a;}) + .dispatchKey(TensorType2())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_input", ""); ASSERT_TRUE(op.has_value()); @@ -316,8 +314,8 @@ int64_t captured_int_input = 0; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_input(Tensor dummy, int input) -> ()", - kernel([] (Tensor, int64_t a) -> void {captured_int_input = a;}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, int64_t a) -> void {captured_int_input = a;}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -331,8 +329,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntInput_without TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_input(Tensor dummy, int input) -> int", - kernel([] (Tensor, int64_t a) {return a + 1;}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, int64_t a) {return a + 1;}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_input", ""); ASSERT_TRUE(op.has_value()); @@ -347,8 +345,8 @@ int64_t captured_input_list_size = 0; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_list_input(Tensor dummy, int[] input) -> ()", - kernel([] (Tensor, ArrayRef a) {captured_input_list_size = a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, const std::vector& a) {captured_input_list_size = a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -362,8 +360,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_wit TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::int_list_input(Tensor dummy, int[] input) -> int", - kernel([] (Tensor, ArrayRef a) -> int64_t {return a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (Tensor, const std::vector& a) -> int64_t {return a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::int_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -376,8 +374,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithIntListInput_wit TEST(OperatorRegistrationTest_LambdaBasedKernel, 
givenKernelWithTensorListInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_list_input(Tensor[] input) -> ()", - kernel([] (ArrayRef a) -> void {captured_input_list_size = a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const std::vector& a) -> void {captured_input_list_size = a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -391,8 +389,8 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListInput_ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithTensorListInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() .op("_test::tensor_list_input(Tensor[] input) -> int", - kernel([] (ArrayRef a) -> int64_t {return a.size();}), - dispatchKey(TensorType1())); + RegisterOperators::options().kernel([] (const std::vector& a) -> int64_t {return a.size();}) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::tensor_list_input", ""); ASSERT_TRUE(op.has_value()); @@ -406,7 +404,7 @@ int captured_dict_size = 0; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, Tensor) input) -> ()", kernel([] (Dict input1) { + .op("_test::dict_input(Dict(str, Tensor) input) -> ()", RegisterOperators::options().kernel([] (Dict input1) { captured_dict_size = input1.size(); })); @@ -424,7 +422,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withou TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_input(Dict(str, str) input) -> str", kernel([] (Dict input1) { + .op("_test::dict_input(Dict(str, str) input) -> str", RegisterOperators::options().kernel([] (Dict input1) { return input1.at("key2"); })); @@ -441,7 +439,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictInput_withOu TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithDictOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators() - .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", kernel([] (Dict input) { + .op("_test::dict_output(Dict(str, str) input) -> Dict(str, str)", RegisterOperators::options().kernel([] (Dict input) { return input; })); @@ -467,7 +465,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenFallbackKernelWithoutAnyAr // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel([] () {called = true;})); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel([] () {called = true;})); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -482,7 +480,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenFallbackKernelWithoutTenso // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. 
auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel([] (int64_t arg) {return arg + 1;})); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel([] (int64_t arg) {return arg + 1;})); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -499,13 +497,13 @@ c10::optional called_arg4; TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_withoutOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> ()", - kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; called_arg4 = arg4; - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -534,14 +532,14 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_w TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_withOutput_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? arg4) -> Tensor?", - kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { called = true; called_arg2 = arg2; called_arg3 = arg3; called_arg4 = arg4; return arg2; - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -572,10 +570,10 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_w TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_withMultipleOutputs_whenRegistered_thenCanBeCalled) { auto registrar = RegisterOperators().op( "_test::opt_input(Tensor arg1, Tensor? arg2, int? arg3, str? 
arg4) -> (Tensor?, int?, str?)", - kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { + RegisterOperators::options().kernel([] (Tensor arg1, const c10::optional& arg2, c10::optional arg3, c10::optional arg4) { return std::make_tuple(arg2, arg3, arg4); - }), - dispatchKey(TensorType1())); + }) + .dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::opt_input", ""); ASSERT_TRUE(op.has_value()); @@ -594,7 +592,7 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernelWithOptionalInputs_w TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenInfersSchema) { auto registrar = RegisterOperators() - .op("_test::no_schema_specified", kernel([] (Tensor arg1, int64_t arg2, ArrayRef arg3) -> std::tuple {return {};})); + .op("_test::no_schema_specified", RegisterOperators::options().kernel([] (Tensor arg1, int64_t arg2, const std::vector& arg3) -> std::tuple {return {};})); auto op = c10::Dispatcher::singleton().findSchema("_test::no_schema_specified", ""); ASSERT_TRUE(op.has_value()); @@ -605,35 +603,35 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenKernel_whenRegisteredWitho TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentNumArguments_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2) -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch() -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch() -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of arguments is different. 
Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", kernel([] (Tensor, Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg, Tensor arg2, Tensor arg3) -> ()", RegisterOperators::options().kernel([] (Tensor, Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of arguments is different. Specified 3 but inferred 2" ); } @@ -641,18 +639,18 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDiffe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentArgumentType_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg1, int arg2) -> int", kernel([] (Tensor, int64_t) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, int arg2) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg1, float arg2) -> int", kernel([] (Tensor, int64_t) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg1, float arg2) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in argument 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(int arg1, int arg2) -> int", kernel([] (Tensor, int64_t) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(int arg1, int arg2) -> int", RegisterOperators::options().kernel([] (Tensor, int64_t) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in argument 1: specified int but inferred Tensor" ); } @@ -660,58 +658,58 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDiffe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentNumReturns_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 1" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 2 but inferred 1" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor) -> void {}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 0" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel([] (Tensor) -> void {}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel([] (Tensor) -> void {}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 2 but inferred 0" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> ()", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> ()", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 0 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. Specified 1 but inferred 2" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, Tensor, Tensor)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "The number of returns is different. 
Specified 3 but inferred 2" ); } @@ -719,46 +717,46 @@ TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDiffe TEST(OperatorRegistrationTest_LambdaBasedKernel, givenMismatchedKernel_withDifferentReturnTypes_whenRegistering_thenFails) { // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> int", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> int", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified Tensor but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel([] (Tensor) -> int64_t {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel([] (Tensor) -> int64_t {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred int" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> Tensor", kernel([] (Tensor) -> Tensor {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> Tensor", RegisterOperators::options().kernel([] (Tensor) -> Tensor {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> float", kernel([] (Tensor) -> Tensor {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> float", RegisterOperators::options().kernel([] (Tensor) -> Tensor {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified float but inferred Tensor" ); // assert this does not fail because it matches RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, int)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, int)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); // and now a set of mismatching schemas expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (Tensor, float)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (Tensor, float)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 2: specified float but inferred int" ); expectThrows([] { RegisterOperators() - .op("_test::mismatch(Tensor arg) -> (int, int)", kernel([] (Tensor) -> std::tuple {return {};}), dispatchKey(TensorType1())); + .op("_test::mismatch(Tensor arg) -> (int, int)", RegisterOperators::options().kernel([] (Tensor) -> std::tuple {return {};}).dispatchKey(TensorType1())); }, "Type mismatch in return 1: specified int but inferred Tensor" ); } diff --git a/aten/src/ATen/core/op_registration/kernel_stackbased.h b/aten/src/ATen/core/op_registration/kernel_stackbased.h deleted file mode 100644 index 8ec981979f31..000000000000 --- 
a/aten/src/ATen/core/op_registration/kernel_stackbased.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -/** - * This file implements c10::kernel(stack_based_kernel) which is used in the - * kernel registration API to set the dispatch key for a registered kernel. - * You probably don't want to use this API, stack based kernels are internal - * only. There's other, better kernel APIs which are built on top of this one. - * - * You probably don't want to include this file directly but include - * op_registration.h instead since that adds more functionality you'll - * likely need to register your operators. - */ - -#include - -namespace c10 { - -namespace detail { - - struct NoFunctionSchemaInference final { - std::unique_ptr operator()() const { - return nullptr; - } - }; - - template - struct KernelRegistrationConfigParameter final { - template - constexpr KernelRegistrationConfigParameter(KernelFunction* kernel_func, KernelCacheCreatorFunction__&& cache_creator_func, InferFunctionSchemaFunction&& infer_function_schema_func) - : kernel_func_(kernel_func) - , cache_creator_func_(std::forward(cache_creator_func)) - , infer_function_schema_func_(std::forward(infer_function_schema_func)) { - } - - void apply(KernelRegistrationConfig* registration) const & { - registration->kernel_func = kernel_func_; - registration->cache_creator_func = cache_creator_func_; - registration->inferred_function_schema = infer_function_schema_func_(); - } - - void apply(KernelRegistrationConfig* registration) && { - registration->kernel_func = kernel_func_; - registration->cache_creator_func = std::move(cache_creator_func_); - registration->inferred_function_schema = std::move(infer_function_schema_func_)(); - } - - private: - KernelFunction* kernel_func_; - KernelCacheCreatorFunction_ cache_creator_func_; - InferFunctionSchemaFunction infer_function_schema_func_; - }; - - static_assert(is_registration_config_parameter>::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); -} - -/** - * Use this to register an operator whose kernel is implemented by a stack - * based function. This is meant to be used internally, for example for writing - * wrappers for other ways of writing operators. This is not part of the - * public API. 
- * - * Example: - * - * > namespace { - * > void my_kernel_cpu(Stack* stack, KernelCache* cache) {...} - * > unique_ptr my_cache_creator() {...} - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(&my_kernel_cpu, &my_cache_creator), - * > c10::dispatchKey(CPUTensorId())); - */ -template -inline constexpr detail::KernelRegistrationConfigParameter, detail::NoFunctionSchemaInference> kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction_&& cache_creator) { - static_assert(detail::is_registration_config_parameter, detail::NoFunctionSchemaInference>>::value, "KernelRegistrationConfigParameter must fulfill the registration config parameter concept"); - - return {kernel_func, std::forward(cache_creator), detail::NoFunctionSchemaInference()}; -} - -} diff --git a/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp b/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp index 0fc2060601a5..e5cf67e16db5 100644 --- a/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp +++ b/aten/src/ATen/core/op_registration/kernel_stackbased_test.cpp @@ -6,8 +6,6 @@ #include using c10::RegisterOperators; -using c10::kernel; -using c10::dispatchKey; using c10::TensorTypeId; using c10::KernelCache; using c10::Stack; @@ -60,32 +58,32 @@ void expectCallsDecrement(TensorTypeId type_id) { } TEST(OperatorRegistrationTest_StackBasedKernel, givenKernel_whenRegistered_thenCanBeCalled) { - auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_StackBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInOneRegistrar_thenCallsRightKernel) { auto registrar = RegisterOperators() - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())) - .op("_test::my_op(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType1())) - .op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())); + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())) + .op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType1())) + .op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_StackBasedKernel, givenMultipleOperatorsAndKernels_whenRegisteredInMultipleRegistrars_thenCallsRightKernel) { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())); - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())); - auto registrar3 = 
RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType1())); - auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", kernel(&errorKernel, &noCache), dispatchKey(TensorType2())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())); + auto registrar3 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType1())); + auto registrar4 = RegisterOperators().op("_test::error(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&errorKernel, &noCache).dispatchKey(TensorType2())); expectCallsIncrement(TensorType1()); } TEST(OperatorRegistrationTest_StackBasedKernel, givenKernel_whenRegistrationRunsOutOfScope_thenCannotBeCalledAnymore) { { - auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&incrementKernel, &noCache), dispatchKey(TensorType1())); + auto registrar1 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&incrementKernel, &noCache).dispatchKey(TensorType1())); { - auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", kernel(&decrementKernel, &noCache), dispatchKey(TensorType2())); + auto registrar2 = RegisterOperators().op("_test::my_op(Tensor dummy, int input) -> int", RegisterOperators::options().kernel(&decrementKernel, &noCache).dispatchKey(TensorType2())); // assert that schema and cpu kernel are present expectCallsIncrement(TensorType1()); @@ -112,7 +110,7 @@ TEST(OperatorRegistrationTest_StackBasedKernel, givenFallbackKernelWithoutAnyArg // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. auto registrar = RegisterOperators() - .op("_test::no_tensor_args() -> ()", kernel(&kernelWithoutInputs, &noCache)); + .op("_test::no_tensor_args() -> ()", RegisterOperators::options().kernel(&kernelWithoutInputs, &noCache)); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -131,7 +129,7 @@ TEST(OperatorRegistrationTest_StackBasedKernel, givenFallbackKernelWithoutTensor // is no way to get the dispatch key. For operators that only have a fallback // kernel, this must work for backwards compatibility. 
auto registrar = RegisterOperators() - .op("_test::no_tensor_args(int arg) -> int", kernel(&kernelWithoutTensorInputs, &noCache)); + .op("_test::no_tensor_args(int arg) -> int", RegisterOperators::options().kernel(&kernelWithoutTensorInputs, &noCache)); auto op = c10::Dispatcher::singleton().findSchema("_test::no_tensor_args", ""); ASSERT_TRUE(op.has_value()); @@ -146,7 +144,7 @@ void kernelForSchemaInference(Stack* stack, KernelCache* cache) { TEST(OperatorRegistrationTest_StackBasedKernel, givenKernel_whenRegisteredWithoutSpecifyingSchema_thenFailsBecauseItCannotInferFromStackBasedKernel) { expectThrows([] { - RegisterOperators().op("_test::no_schema_specified", kernel(&kernelForSchemaInference, &noCache)); + RegisterOperators().op("_test::no_schema_specified", RegisterOperators::options().kernel(&kernelForSchemaInference, &noCache)); }, "Cannot infer schema from this kernel function. Please explicitly specify the operator schema."); } @@ -165,7 +163,7 @@ void increment_sequence_kernel(Stack* stack, KernelCache* cache) { } TEST(OperatorRegistrationTest_StackBasedKernel, givenKernelWithCache_whenCalled_thenCacheIsHandledCorrectly) { - auto registrar = RegisterOperators().op("_test::increment_sequence(Tensor dummy) -> int", kernel(&increment_sequence_kernel, &make_cache), dispatchKey(TensorType1())); + auto registrar = RegisterOperators().op("_test::increment_sequence(Tensor dummy) -> int", RegisterOperators::options().kernel(&increment_sequence_kernel, &make_cache).dispatchKey(TensorType1())); auto op = c10::Dispatcher::singleton().findSchema("_test::increment_sequence", ""); ASSERT_TRUE(op.has_value()); diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index ec0677ea4cec..a651cb26ae60 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -38,48 +38,48 @@ class RegisterOperators::OperatorRegistrar final { c10::optional kernel_registration_handle_; }; -void RegisterOperators::checkSchemaAndRegisterOp_(const std::string& schemaOrNameStr, detail::KernelRegistrationConfig&& config) { +void RegisterOperators::checkSchemaAndRegisterOp_(const std::string& schemaOrNameStr, Options&& options) { #if defined(CAFFE2_IS_XPLAT_BUILD) throw std::logic_error("We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build."); #else either schemaOrName = torch::jit::parseSchemaOrName(schemaOrNameStr); if (schemaOrName.is_right()) { // schema was explicitly specified. Check it matches the inferred one and register the op. - checkSchemaAndRegisterOp_(std::move(schemaOrName).right(), std::move(config)); + checkSchemaAndRegisterOp_(std::move(schemaOrName).right(), std::move(options)); } else { // schema wasn't explicitly specified. Take the inferred schema for registering the op. - AT_ASSERTM(nullptr != config.inferred_function_schema.get(), "Cannot infer schema from this kernel function. Please explicitly specify the operator schema."); + AT_ASSERTM(nullptr != options.config.inferred_function_schema.get(), "Cannot infer schema from this kernel function. 
Please explicitly specify the operator schema."); OperatorName name = std::move(schemaOrName).left(); FunctionSchema inferredSchema( std::move(name.name), std::move(name.overload_name), - config.inferred_function_schema->arguments(), - config.inferred_function_schema->returns(), - config.inferred_function_schema->is_vararg(), - config.inferred_function_schema->is_varret() + options.config.inferred_function_schema->arguments(), + options.config.inferred_function_schema->returns(), + options.config.inferred_function_schema->is_vararg(), + options.config.inferred_function_schema->is_varret() ); - registerOp_(std::move(inferredSchema), std::move(config)); + registerOp_(std::move(inferredSchema), std::move(options)); } #endif } -void RegisterOperators::checkSchemaAndRegisterOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config) { - if (config.inferred_function_schema.get() != nullptr) { - assertSchemasHaveSameSignature(*config.inferred_function_schema, schema); +void RegisterOperators::checkSchemaAndRegisterOp_(FunctionSchema&& schema, Options&& options) { + if (options.config.inferred_function_schema.get() != nullptr) { + assertSchemasHaveSameSignature(*options.config.inferred_function_schema, schema); } - registerOp_(std::move(schema), std::move(config)); + registerOp_(std::move(schema), std::move(options)); } -void RegisterOperators::registerOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config) { - AT_CHECK(!config.dispatch_key.has_value() || config.kernel_func != nullptr, +void RegisterOperators::registerOp_(FunctionSchema&& schema, Options&& options) { + TORCH_CHECK(!options.config.dispatch_key.has_value() || options.config.kernel_func != nullptr, "Tried to register an operator with a dispatch key but without a kernel. " "Please either specify a kernel or omit the dispatch key to only register the schema."); // if kernel_func is set, so must be cache_creator_func, the API shouldn't allow anything else. - AT_ASSERT((config.kernel_func != nullptr) == static_cast(config.cache_creator_func)); + AT_ASSERT((options.config.kernel_func != nullptr) == static_cast(options.config.cache_creator_func)); - registrars_.emplace_back(std::move(schema), config.dispatch_key, config.kernel_func, std::move(config.cache_creator_func)); + registrars_.emplace_back(std::move(schema), options.config.dispatch_key, options.config.kernel_func, std::move(options.config.cache_creator_func)); } RegisterOperators::RegisterOperators() = default; diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index d5c576b7e604..076101f436ee 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -5,9 +5,7 @@ * functionality needed to do so for you. 
*/ -#include -#include -#include +#include #include #include #include @@ -30,11 +28,11 @@ namespace c10 { * > } * > * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CPUTensorId())); */ -class C10_API RegisterOperators final { +class CAFFE2_API RegisterOperators final { public: RegisterOperators(); ~RegisterOperators(); @@ -44,83 +42,238 @@ class C10_API RegisterOperators final { RegisterOperators(RegisterOperators&&) noexcept; RegisterOperators& operator=(RegisterOperators&&) noexcept; + class CAFFE2_API Options final { + public: + Options(const Options&) = delete; + Options(Options&&) noexcept = delete; + Options& operator=(const Options&) = delete; + Options& operator=(Options&&) noexcept = delete; + + // internal-only for registering stack based kernels + Options&& kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction&& cache_creator) && { + return std::move(*this).kernel(kernel_func, std::move(cache_creator), nullptr); + } + + /** + * Use this to register an operator whose kernel is implemented as a functor + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CPUTensorId())); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel("some_configuration", 3, true) + * > .dispatchKey(CPUTensorId())); + */ + template + // enable_if: only enable it if KernelFunctor is actually a functor + guts::enable_if_t::value, Options&&> kernel(ConstructorParameters&&... constructorParameters) { + static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + + return std::move(*this).kernelFunctor(std::forward(constructorParameters)...); + } + + /** + * Use this to register an operator whose kernel is implemented by a function: + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators() + * > .kernel() + * > .dispatchKey(CPUTensorId())); + */ + template + // enable_if: only enable it if FuncType is actually a function + guts::enable_if_t::value, Options&&> kernel() { + static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) 
API or also implement the kernel function as defined by the public API."); + + return kernel::type>(); + } + + /** + * Use this to register an operator whose kernel is implemented as a lambda. + * The lambda must be stateless, i.e. not have a capture. If your kernel + * needs to store some configuration parameters, write the kernel as a + * functor instead. + * + * Example: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel([] (Tensor a) -> Tensor {...}) + * > .dispatchKey(CPUTensorId())); + */ + template + // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) + guts::enable_if_t>::value, Options&&> kernel(Lambda&& functor) { + static_assert(!std::is_base_of::value, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); + + // We don't support stateful lambdas (i.e. lambdas with a capture), because their + // behavior would be nonobvious. A functor kernel with cache gets a new instance of + // its cache each time the kernel is looked up from the dispatch table. + // A lambda with a capture would be global and share its capture between all kernel lookups. + // So, instead of making users having to think about it (including the thread-safety + // issues this causes), let's just forbid stateful lambdas alltogether. + static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead."); + + return std::move(*this).kernelFunctor>>(std::forward(functor)); + } + + /** + * Use this to register an operator with a kernel for a certain dispatch key. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > class my_kernel_cuda final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CPUTensorId())) + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel() + * > .dispatchKey(CUDATensorId())); + */ + Options&& dispatchKey(TensorTypeId dispatch_key) && { + if (config.dispatch_key.has_value()) { + AT_ERROR("Operator registration: Cannot register multiple dispatch keys in the same op() call. Please call op() multiple times if you want to register multiple kernels."); + } + config.dispatch_key = dispatch_key; + return std::move(*this); + } + + private: + Options&& kernel(KernelFunction* kernel_func, KernelCacheCreatorFunction&& cache_creator, std::unique_ptr&& inferred_function_schema) && { + if (nullptr != config.kernel_func) { + AT_ERROR("Operator registration: Cannot register multiple kernels in the same op() call. 
Please call op() multiple times if you want to register multiple kernels."); + } + AT_ASSERTM(nullptr == config.cache_creator_func, "kernel_func was nullptr, so cache_creator_func must be too"); + AT_ASSERTM(nullptr == config.inferred_function_schema, "kernel_func was nullptr, so inferred_function_schema must be too"); + + config.kernel_func = kernel_func; + config.cache_creator_func = std::move(cache_creator); + config.inferred_function_schema = std::move(inferred_function_schema); + return std::move(*this); + } + + template + Options&& kernelFunctor(ConstructorParameters&&... constructorParameters) && { + return std::move(*this).kernel( + &detail::wrap_kernel_functor::call, + detail::KernelFactory...>(std::forward(constructorParameters)...), + detail::FunctionSchemaInferer()() + ); + } + + Options() = default; + + // KernelRegistrationConfig accumulates all information from the config + // parameters passed to a RegisterOperators::op() call into one object. + struct KernelRegistrationConfig final { + KernelRegistrationConfig() + : dispatch_key(c10::nullopt) + , kernel_func(nullptr) + , cache_creator_func(nullptr) + , inferred_function_schema(nullptr) + {} + + c10::optional dispatch_key; + KernelFunction* kernel_func; + KernelCacheCreatorFunction cache_creator_func; + std::unique_ptr inferred_function_schema; + }; + + KernelRegistrationConfig config; + friend class RegisterOperators; + }; + /** - * Register an operator based on a function schema and a set of configuration - * parameters (i.e. kernel function, dispatch key, ...). - * - * Example: - * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", - * > c10::kernel(), - * > c10::dispatchKey(CPUTensorId())); + * Call this to get an instance of registration options, which + * can be passed to a call to RegisterOperators::op() to specify + * these options for the operator registration. + * See class doc comment for examples. */ - template - RegisterOperators op(const std::string& schemaOrName, ConfigParameters&&... configParameters) && { - static_assert(guts::conjunction>...>::value, - "Invalid argument passed to op(). Examples for valid arguments are c10::kernel(...) for defining a kernel " - " and c10::dispatchKey(...) for defining a dispatch key. Please see the documentation for registering c10 operators."); - - op_(schemaOrName, std::forward(configParameters)...); - return std::move(*this); + static Options options() { + return {}; } - // This FunctionSchema based variant is only meant to be used for internal - // purposes when we already have a pre-parsed FunctionSchema. - // This is for example used for exposing legacy caffe2 operators to c10. - template - RegisterOperators op(FunctionSchema schema, ConfigParameters&&... configParameters) && { - static_assert(guts::conjunction>...>::value, - "Invalid argument passed to op(). Examples for valid arguments are c10::kernel(...) for defining a kernel " - " and c10::dispatchKey(...) for defining a dispatch key. Please see the documentation for registering c10 operators."); - - op_(std::move(schema), std::forward(configParameters)...); + /** + * Call this to register an operator. See class doc comment for examples. 
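The Options-based registration documented in the comment blocks above is hard to read here because this rendering of the patch drops the text inside angle brackets. The sketch below restores plausible template arguments as an assumption; MyAddCPU, the "my_ops::my_add" schema, and the namespace are hypothetical names used only for illustration, not part of the patch.

#include <ATen/core/op_registration/op_registration.h>

namespace {

// Kernel functor: inherits from c10::OperatorKernel as required by kernel<...>().
class MyAddCPU final : public c10::OperatorKernel {
 public:
  at::Tensor operator()(at::Tensor a, at::Tensor b) {
    return a + b;
  }
};

// One op() call registers one kernel for one dispatch key; chain further
// op() calls to register kernels for other dispatch keys.
static auto registry = c10::RegisterOperators()
    .op("my_ops::my_add(Tensor a, Tensor b) -> Tensor",
        c10::RegisterOperators::options()
            .kernel<MyAddCPU>()                // template argument assumed
            .dispatchKey(c10::CPUTensorId())); // spelled as in the doc comment above

} // namespace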
+ */ + RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && { + checkSchemaAndRegisterOp_(schemaOrName, std::move(options)); return std::move(*this); } - template - C10_DEPRECATED_MESSAGE("Registering kernels via passing arguments to RegisterOperators(...) is deprecated. " \ - "Please use RegisterOperators().op(...) instead.") - // enable_if: only enable it if FuncType is actually a function, but not a stack based KernelFunction. - explicit RegisterOperators(guts::enable_if_t::value && !std::is_same::value, const std::string&> schemaOrName, FuncType* func) - : RegisterOperators() { - legacyAPIOp_(schemaOrName, func); + // internal only for registering caffe2 ops + RegisterOperators&& op(FunctionSchema schema, Options&& options) && { + checkSchemaAndRegisterOp_(std::move(schema), std::move(options)); + return std::move(*this); } template - C10_DEPRECATED_MESSAGE("Registering kernels via passing arguments to RegisterOperators(...) is deprecated. " \ - "Please use RegisterOperators().op(...) instead.") - // enable_if: only enable it if FuncType is actually a functor - explicit RegisterOperators(guts::enable_if_t::value, const std::string&> schemaOrName, FuncType&& func) + explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options()) : RegisterOperators() { - legacyAPIOp_(schemaOrName, std::forward(func)); + std::move(*this).op(schemaOrName, std::forward(func), std::move(options)); } /** - * Deprecated. For backwards compatibility only. - * Don't use this, it introduces a performance overhead on each kernel call - * due to the kernel being stored in the wrapper as a runtime function pointer. + * This API registers an operator based on a kernel function pointer. * * Given a kernel * * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } * - * This deprecated API looks like: + * This API looks like: * * > static auto registry = c10::RegisterOperators() * > .op("my_op", &my_kernel_cpu); * - * But you should use the new API instead: + * If your kernel is small and the overhead of calling it matters, + * then this API might be the wrong choice since the followig API + * has a slightly lower overhead for calling into the kernel: * * > static auto registry = c10::RegisterOperators() - * > .op("my_op", kernel()); + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); * * Or, alternatively, write your kernel as a functor: * @@ -132,83 +285,47 @@ class C10_API RegisterOperators final { * > } * > * > static auto registry = c10::RegisterOperators() - * > .op("my_op", c10::kernel()); + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); */ - template - C10_DEPRECATED_MESSAGE("Registering kernels via passing function pointers to op() directly is deprecated. " \ - "Please use the new c10::kernel() based API instead.") + template // enable_if: only enable it if FuncType is actually a function, but not a stack based KernelFunction. - guts::enable_if_t::value && !std::is_same::value, RegisterOperators> - op(const std::string& schemaOrName, FuncType* func, OtherArgs...) && { - // We intentionally don't extend this deprecated API to support dispatch keys - // and the like to push people towards using the new API. - static_assert(sizeof...(OtherArgs) == 0, "The deprecated function pointer based API to register kernels doesn't allow additional arguments for dispatch keys or other things. 
Please use the new c10::kernel() based API instead."); - - legacyAPIOp_(schemaOrName, func); - return std::move(*this); + guts::enable_if_t::value && !std::is_same::value, RegisterOperators&&> + op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && { + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(schemaOrName, std::move(options).kernelFunctor>, AllowLegacyTypes>(func)); } /** - * Deprecated. For backwards compatibility only. + * This API registers an operator based on a kernel lambda. * - * This deprecated API looks like: + * This API looks like: * * > static auto registry = c10::RegisterOperators() * > .op("my_op", [] (Tensor a, Tensor b) {...}); * - * But you should use the new API instead: + * This is equivalent to: * * > static auto registry = c10::RegisterOperators() - * > .op("my_op", kernel([] (Tensor a, Tensor b) {...})); - * - * Or, alternatively, write your kernel as a functor: + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel([] (Tensor a, Tensor b) {...})); * - * > namespace { - * > class my_kernel_cpu final : public c10::OperatorKernel { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > } - * > - * > static auto registry = c10::RegisterOperators() - * > .op("my_op", c10::kernel()); */ - template - C10_DEPRECATED_MESSAGE("Registering kernels via passing lambdas to op() directly is deprecated. " \ - "Please use the new c10::kernel() based API instead.") + template // enable_if: only enable it if FuncType is actually a functor - guts::enable_if_t::value, RegisterOperators> - op(const std::string& schemaOrName, FuncType&& func, OtherArgs...) && { - // We intentionally don't extend this deprecated API to support dispatch keys - // and the like to push people towards using the new API. - static_assert(sizeof...(OtherArgs) == 0, "The deprecated lambda based API to register kernels doesn't allow additional arguments for dispatch keys or other things. Please use the new c10::kernel() based API instead."); - - static_assert(!std::is_base_of::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new c10::kernel() based API instead."); + guts::enable_if_t::value, RegisterOperators&&> + op(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options()) && { + static_assert(!std::is_base_of::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); - legacyAPIOp_(schemaOrName, std::forward(func)); - return std::move(*this); + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(schemaOrName, std::move(options).kernelFunctor>, AllowLegacyTypes>(std::forward(func))); } private: - template - void op_(FunctionSchema&& schema, ConfigParameters&&... configParameters) { - checkSchemaAndRegisterOp_(std::move(schema), detail::make_registration_config(std::forward(configParameters)...)); - } - template - void op_(const std::string& schemaOrName, ConfigParameters&&... 
configParameters) { - checkSchemaAndRegisterOp_(schemaOrName, detail::make_registration_config(std::forward(configParameters)...)); - } - - template - void legacyAPIOp_(const std::string& schemaOrName, FuncType&& func) { - constexpr bool AllowLegacyTypes = true; - op_(schemaOrName, detail::kernelFunctor>, AllowLegacyTypes>(std::forward(func))); - } - - void checkSchemaAndRegisterOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config); - void checkSchemaAndRegisterOp_(const std::string& schemaOrName, detail::KernelRegistrationConfig&& config); + void checkSchemaAndRegisterOp_(FunctionSchema&& schema, Options&& config); + void checkSchemaAndRegisterOp_(const std::string& schemaOrName, Options&& config); - void registerOp_(FunctionSchema&& schema, detail::KernelRegistrationConfig&& config); + void registerOp_(FunctionSchema&& schema, Options&& config); class OperatorRegistrar; diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index b44838a78eb3..63dd7b5391d9 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -13,8 +13,6 @@ using c10::RegisterOperators; using c10::OperatorKernel; -using c10::kernel; -using c10::dispatchKey; using c10::Dispatcher; using c10::IValue; using at::Tensor; @@ -40,7 +38,7 @@ struct MockKernel final : OperatorKernel { bool* called_; }; TEST(OperatorRegistrationTest, givenOpWithoutFallbackKernel_whenCallingOpWithWrongDispatchKey_thenFails) { - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -50,9 +48,9 @@ TEST(OperatorRegistrationTest, givenOpWithoutFallbackKernel_whenCallingOpWithWro } TEST(OperatorRegistrationTest, givenOpWithFallbackKernelOutOfScope_whenCallingOpWithWrongDispatchKey_thenFails) { - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); { - auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel()); + auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel()); // this registered a fallback kernel, but now that registration goes out of scope and deregisters it } @@ -65,7 +63,7 @@ TEST(OperatorRegistrationTest, givenOpWithFallbackKernelOutOfScope_whenCallingOp TEST(OperatorRegistrationTest, givenOpWithOnlyFallbackKernel_whenCallingOp_thenCallsFallbackKernel) { bool called = false; - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called)); // note: no dispatch key means this is the fallback kernel + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called)); // note: no dispatch key means this is the fallback kernel auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -77,9 +75,9 @@ TEST(OperatorRegistrationTest, givenOpWithOnlyFallbackKernel_whenCallingOp_thenC 
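The hunks above also keep the function-pointer and lambda overloads of op() as non-deprecated conveniences, noting that the options().kernel<...>() form has slightly lower call overhead. A minimal sketch of the two shorthands, assuming the usual op_registration header path and using hypothetical my_relu names and schemas:

#include <ATen/core/op_registration/op_registration.h>

namespace {

at::Tensor my_relu(at::Tensor a) {
  return at::relu(a);
}

// Function-pointer shorthand: the kernel is stored as a runtime function
// pointer, which is why the comment block above mentions a small call overhead.
static auto registry_fn = c10::RegisterOperators()
    .op("my_ops::my_relu(Tensor a) -> Tensor", &my_relu);

// Lambda shorthand: only stateless (capture-free) lambdas are accepted.
static auto registry_lambda = c10::RegisterOperators()
    .op("my_ops::my_relu2(Tensor a) -> Tensor",
        [] (at::Tensor a) { return at::relu(a); });

} // namespace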
TEST(OperatorRegistrationTest, givenOpWithOnlyFallbackKernelAndOtherKernelOutOfScope_whenCallingOp_thenCallsFallbackKernel) { bool called = false; bool other_called = false; - auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called)); // note: no dispatch key means this is the fallback kernel + auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called)); // note: no dispatch key means this is the fallback kernel { - auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&other_called), dispatchKey(TensorType2())); + auto inner_registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&other_called).dispatchKey(TensorType2())); } auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); @@ -94,8 +92,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstFallbackAndThenOtherKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())); + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -110,8 +108,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstFallbackAndThenOtherKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())); + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)) // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -127,8 +125,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstOtherAndThenFallbackKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())) - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())) + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -143,8 +141,8 @@ TEST(OperatorRegistrationTest, givenOpWithFirstOtherAndThenFallbackKernel_whenCa bool called_kernel = false; bool called_fallback = false; auto registrar = c10::RegisterOperators() - 
.op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())) - .op("_test::dummy(Tensor dummy) -> ()", kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())) + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_fallback)); // note: no dispatch key means this is the fallback kernel auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); @@ -178,7 +176,7 @@ TEST(OperatorRegistrationTest, givenOpWithoutKernels_whenRegisteringKernelAfterw auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); bool called_kernel = false; - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel), dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered @@ -190,7 +188,7 @@ TEST(OperatorRegistrationTest, givenOpWithoutKernels_whenRegisteringKernelAfterw auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); { - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); } auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); @@ -211,22 +209,22 @@ TEST(OperatorRegistrationTest, givenOpWithoutKernelsWithoutTensorInputs_whenRegi TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenRegistering_thenShowsWarning) { auto registrar = c10::RegisterOperators() - .op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + .op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered testing::internal::CaptureStderr(); - c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(), dispatchKey(TensorType1())); + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1())); std::string output = testing::internal::GetCapturedStderr(); - EXPECT_THAT(output, testing::HasSubstr("Registered a kernel that overwrote a previoulsy registered kernel with same dispatch key")); + EXPECT_THAT(output, testing::HasSubstr("Registered a kernel that overwrote a previously registered kernel with same dispatch key")); } TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", 
c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered @@ -239,8 +237,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenCalled_thenCa TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); auto op = Dispatcher::singleton().findSchema("_test::dummy", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered @@ -253,8 +251,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenCalle TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenNewerKernelDeletedAndOpCalled_thenCallsOlderKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -269,8 +267,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenNewerKernelDe TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewerKernelDeletedAndOpCalled_thenCallsOlderKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -285,8 +283,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewer TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenOlderKernelDeletedAndOpCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = 
c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -301,8 +299,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenOlderKernelDe TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenOlderKernelDeletedAndOpCalled_thenCallsNewerKernel) { bool called_kernel1 = false; bool called_kernel2 = false; - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -318,8 +316,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenOlderAndThenN bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar1 = c10::RegisterOperators(); // destruct the registrar registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -336,8 +334,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenOlder bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar1 = c10::RegisterOperators(); // destruct the registrar registrar2 = c10::RegisterOperators(); // destruct the registrar @@ -354,8 +352,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameDispatchKey_whenNewerAndThenO bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = 
c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1), dispatchKey(TensorType1())); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2), dispatchKey(TensorType1())); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1).dispatchKey(TensorType1())); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2).dispatchKey(TensorType1())); registrar2 = c10::RegisterOperators(); // destruct the registrar registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -372,8 +370,8 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewer bool called_kernel1 = false; bool called_kernel2 = false; auto registrar0 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()"); - auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel1)); - auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", kernel(&called_kernel2)); + auto registrar1 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel1)); + auto registrar2 = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel(&called_kernel2)); registrar2 = c10::RegisterOperators(); // destruct the registrar registrar1 = c10::RegisterOperators(); // destruct the registrar @@ -386,7 +384,23 @@ TEST(OperatorRegistrationTest, givenKernelsWithSameFallbackDispatchKey_whenNewer }, "Didn't find kernel to dispatch to for operator '_test::dummy'"); } +TEST(OperatorRegistrationTest, whenTryingToRegisterWithMultipleKernels_thenFails) { + expectThrows([&] { + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().kernel()); + }, "Cannot register multiple kernels in the same op() call"); +} + +TEST(OperatorRegistrationTest, whenTryingToRegisterWithMultipleDispatchKeys_thenFails) { + expectThrows([&] { + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().kernel().dispatchKey(TensorType1()).dispatchKey(TensorType2())); + }, "Cannot register multiple dispatch keys in the same op() call"); +} +TEST(OperatorRegistrationTest, whenTryingToRegisterWithDispatchKeyWithoutKernel_thenFails) { + expectThrows([&] { + c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options().dispatchKey(TensorType1())); + }, "Tried to register an operator with a dispatch key but without a kernel"); +} /** * This is used to check that a given type works correctly when passed as input @@ -416,7 +430,7 @@ struct ArgTypeTestKernel final : OperatorKernel { } static void test(InputType input, std::function inputExpectation, OutputType output, std::function outputExpectation, const std::string& schema) { - auto registry = c10::RegisterOperators().op("_test::my_op" + schema, kernel(input, std::move(inputExpectation), std::move(output))); + auto registry = c10::RegisterOperators().op("_test::my_op" + schema, c10::RegisterOperators::options().kernel(input, std::move(inputExpectation), std::move(output))); auto op = Dispatcher::singleton().findSchema("_test::my_op", ""); ASSERT_TRUE(op.has_value()); // assert schema is registered auto actualOutput = callOp(*op, std::move(input)); @@ -551,43 +565,47 
@@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { // list types (with empty list) - testArgTypes, std::vector>::test( - c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, + testArgTypes>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toDoubleListRef().size());}, "(float[] a) -> float[]"); - testArgTypes, std::vector>::test( - c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, + testArgTypes, std::vector>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toIntListRef().size());}, "(int[] a) -> int[]"); - // TODO Converting std::vector to ArrayRef doesn't work, so we - // need to find an alternative - // testArgTypes, std::vector>::test( - // c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, - // std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toBoolListRef().size());}, - // "(bool[] a) -> bool[]"); - // testArgTypes, std::vector>::test( - // c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, - // std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toBoolListRef().size());}, - // "(bool[] a) -> bool[]"); - // TODO We currently don't support str[] (i.e. string list) as type. Do we want to? - // testArgTypes, std::vector>::test( - // c10::ArrayRef(), [] (c10::ArrayRef v) {EXPECT_EQ(0, v.size());}, - // std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toStringListRef().size());}, - // "(str[] a) -> str[]"); + testArgTypes>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, + std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toBoolListRef().size());}, + "(bool[] a) -> bool[]"); + testArgTypes>::test( + std::vector(), [] (const std::vector& v) {EXPECT_EQ(0, v.size());}, + std::vector(), [] (const IValue& v) {EXPECT_EQ(0, v.toGenericListRef().size());}, + "(str[] a) -> str[]"); // list types (with non-empty list) - testArgTypes, std::vector>::test( - c10::ArrayRef({1.5, 2.5}), [] (c10::ArrayRef v) {EXPECT_EQ(c10::ArrayRef({1.5, 2.5}), v);}, + testArgTypes>::test( + std::vector({1.5, 2.5}), [] (const std::vector& v) {EXPECT_EQ(std::vector({1.5, 2.5}), v);}, std::vector({3.5, 4.5}), [] (const IValue& v) {EXPECT_EQ(std::vector({3.5, 4.5}), v.toDoubleListRef());}, "(float[] a) -> float[]"); - testArgTypes, std::vector>::test( - c10::ArrayRef({1, 2}), [] (c10::ArrayRef v) {EXPECT_EQ(c10::ArrayRef({1, 2}), v);}, + testArgTypes>::test( + std::vector({1, 2}), [] (const std::vector& v) {EXPECT_EQ(std::vector({1, 2}), v);}, std::vector({3, 4}), [] (const IValue& v) {EXPECT_EQ(std::vector({3, 4}), v.toIntListRef());}, "(int[] a) -> int[]"); - // TODO When fixing bool[] and str[] (see above), also add them here - testArgTypes, std::vector>::test( - c10::ArrayRef({dummyTensor(TensorType1()), dummyTensor(TensorType2())}), [] (c10::ArrayRef v) { + testArgTypes>::test( + std::vector({true, false}), [] (const std::vector& v) {EXPECT_EQ(std::vector({true, false}), v);}, + std::vector({true, false}), [] (const IValue& v) {EXPECT_EQ(std::vector({true, false}), v.toBoolListRef());}, + "(bool[] a) -> bool[]"); + testArgTypes>::test( + std::vector({"first", "second"}), [] (const std::vector& v) {EXPECT_EQ(std::vector({"first", "second"}), v);}, + std::vector({"first", "second"}), [] (const IValue& v) { + EXPECT_EQ(2, v.toGenericListRef().size()); + EXPECT_EQ("first", v.toGenericListRef()[0].toStringRef()); + EXPECT_EQ("second", 
v.toGenericListRef()[1].toStringRef()); + }, + "(str[] a) -> str[]"); + testArgTypes>::test( + std::vector({dummyTensor(TensorType1()), dummyTensor(TensorType2())}), [] (const std::vector& v) { EXPECT_EQ(2, v.size()); EXPECT_EQ(TensorType1(), v[0].type_id()); EXPECT_EQ(TensorType2(), v[1].type_id()); @@ -600,20 +618,20 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Tensor[] a) -> Tensor[]"); // Test optional of list (with nullopt) - testArgTypes>, c10::optional>>::test( - c10::optional>(c10::nullopt), [] (c10::optional> v) {EXPECT_FALSE(v.has_value());}, + testArgTypes>>::test( + c10::optional>(c10::nullopt), [] (const c10::optional>& v) {EXPECT_FALSE(v.has_value());}, c10::optional>(c10::nullopt), [] (const IValue& v) {EXPECT_TRUE(v.isNone());}, "(int[]? a) -> int[]?"); // Test optional of list (with empty list) - testArgTypes>, c10::optional>>::test( - c10::optional>(c10::ArrayRef{}), [] (c10::optional> v) {EXPECT_EQ(0, v.value().size());}, + testArgTypes>>::test( + c10::optional>(std::vector{}), [] (const c10::optional>& v) {EXPECT_EQ(0, v.value().size());}, c10::optional>(std::vector{}), [] (const IValue& v) {EXPECT_EQ(0, v.toIntListRef().size());}, "(int[]? a) -> int[]?"); // Test optional of list (with values) - testArgTypes>, c10::optional>>::test( - c10::optional>({1, 2}), [] (c10::optional> v) {EXPECT_EQ(c10::ArrayRef({1, 2}), v.value());}, + testArgTypes>>::test( + c10::optional>({1, 2}), [] (const c10::optional>& v) {EXPECT_EQ(std::vector({1, 2}), v.value());}, c10::optional>({3, 4}), [] (const IValue& v) {EXPECT_EQ(std::vector({3, 4}), v.toIntListRef());}, "(int[]? a) -> int[]?"); diff --git a/aten/src/ATen/core/op_registration/test_helpers.h b/aten/src/ATen/core/op_registration/test_helpers.h index 5110ec6df065..fcf4c7a3e570 100644 --- a/aten/src/ATen/core/op_registration/test_helpers.h +++ b/aten/src/ATen/core/op_registration/test_helpers.h @@ -36,6 +36,30 @@ struct InputToIValue> final { } }; template +struct InputToIValue>> final { + template + static c10::IValue call(T_&& v) { + auto list = c10::ivalue::GenericList::create({}); + list->elements().reserve(v.size()); + for (std::unordered_map& e : v) { + list->elements().push_back(InputToIValue>::call(std::move(e))); + } + return list; + } +}; +template<> +struct InputToIValue> final { + template + static c10::IValue call(T_&& v) { + auto list = c10::ivalue::GenericList::create({}); + list->elements().reserve(v.size()); + for (std::string& e : v) { + list->elements().push_back(InputToIValue::call(std::move(e))); + } + return list; + } +}; +template struct InputToIValue> final { template static c10::IValue call(T_&& v) { @@ -46,12 +70,12 @@ template struct InputToIValue> final { template static c10::IValue call(T_&& v) { - c10::Dict dict; + c10::impl::GenericDict dict; dict.reserve(v.size()); for (auto& element : v) { - dict.insert(element.first, element.second); + dict.insert(InputToIValue::call(element.first), InputToIValue::call(element.second)); } - return InputToIValue>::call(std::move(dict)); + return c10::IValue(std::move(dict)); } }; } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 60dc0c973208..2520c90f1d2e 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -25,7 +25,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } out << ")"; } else if (auto value = t.cast()) { - out << "Tensor(dtype = "; + out << "ProfiledTensor(dtype = "; if (value->scalarType().has_value()) { out << *value->scalarType(); @@ -157,6 +157,8 @@ TypePtr 
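The testArgTypes changes above switch kernel-side list arguments from c10::ArrayRef to std::vector and enable bool[] and str[] lists. Assuming std::vector is now the accepted kernel-side list type, as the updated tests suggest, a kernel taking a float[] argument would look roughly like the following; the lambda, schema, and names are illustrative only.

#include <ATen/core/op_registration/op_registration.h>
#include <vector>

namespace {

// Lists arrive as std::vector<double> rather than c10::ArrayRef<double>,
// matching the "(float[] a) -> float[]" schema exercised by the tests.
static auto registry = c10::RegisterOperators()
    .op("my_ops::scale_all(float[] a) -> float[]",
        [] (std::vector<double> a) -> std::vector<double> {
          for (double& x : a) {
            x *= 2.0;
          }
          return a;
        });

} // namespace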
incompleteInferTypeFrom(const IValue& value) { return TupleType::create(fmap(value.toTuple()->elements(), incompleteInferTypeFrom)); } else if (value.isDevice()) { return DeviceObjType::get(); + } else if (value.isObject()) { + return value.toObject()->type(); } AT_ERROR("Type cannot be accurately recovered from this IValue."); } diff --git a/aten/src/ATen/cpp_custom_type_hack.h b/aten/src/ATen/cpp_custom_type_hack.h index 660c4bb6ff82..4b3b48583f24 100644 --- a/aten/src/ATen/cpp_custom_type_hack.h +++ b/aten/src/ATen/cpp_custom_type_hack.h @@ -14,9 +14,9 @@ namespace cpp_custom_type_hack { template T& cast(const Tensor& packed) { - AT_CHECK( + TORCH_CHECK( packed.scalar_type() == kByte, "Expected temporary cpp type wrapper"); - AT_CHECK( + TORCH_CHECK( packed.storage().data_ptr().get_deleter() == caffe2::TypeMeta::Make().deleteFn(), "Expected temporary cpp type wrapper of type ", diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 724510b94870..f68880367123 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -728,10 +728,6 @@ inline bool CUDA_tensor_apply1(at::Tensor a, rearrangeDims(&aInfo); aInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!aInfo.isContiguous()) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims); } else { @@ -748,9 +744,6 @@ inline bool CUDA_tensor_apply1(at::Tensor a, if (aInfo.dims == 1) { HANDLE_CASE(uint64_t, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_CASE(uint64_t, -1); } } @@ -881,10 +874,6 @@ inline bool CUDA_tensor_apply2(at::Tensor a, rearrangeDims(&aInfo, &bInfo); aInfo.collapseDims(); bInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && bInfo.isContiguous())) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); } else { @@ -904,9 +893,6 @@ inline bool CUDA_tensor_apply2(at::Tensor a, if (aInfo.dims == 1 && bInfo.dims == 1) { HANDLE_CASE(uint64_t, 1, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_CASE(uint64_t, -1, -1); } } @@ -1071,10 +1057,6 @@ inline bool CUDA_tensor_apply3(at::Tensor a, bInfo.collapseDims(); cInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous())) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); } else { detail::TensorInfo aInfo = @@ -1098,10 +1080,6 @@ inline bool CUDA_tensor_apply3(at::Tensor a, if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1) { HANDLE_CASE(uint64_t, 1, 1, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif - HANDLE_CASE(uint64_t, -1, -1, -1); } } @@ -1311,10 +1289,6 @@ inline bool CUDA_tensor_apply4(at::Tensor a, cInfo.collapseDims(); dInfo.collapseDims(); -#if CUDA_VERSION < 9000 - if (!(aInfo.isContiguous() && 
bInfo.isContiguous() && cInfo.isContiguous() && dInfo.isContiguous())) - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims, dInfo.dims); } else { detail::TensorInfo aInfo = @@ -1342,9 +1316,6 @@ inline bool CUDA_tensor_apply4(at::Tensor a, if (aInfo.dims == 1 && bInfo.dims == 1 && cInfo.dims == 1 && dInfo.dims == 1) { HANDLE_CASE(uint64_t, 1, 1, 1, 1); } else { -#if CUDA_VERSION < 9000 - grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * AT_APPLY_BLOCKS_PER_SM , grid.x); -#endif HANDLE_CASE(uint64_t, -1, -1, -1, -1); } } diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 53eadd4459bc..f27886722730 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -112,7 +112,7 @@ struct AT_CUDA_API CUDAEvent { createEvent(stream.device_index()); } - AT_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, + TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, " does not match recording stream's device ", stream.device_index(), "."); CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaEventRecord(event_, stream)); @@ -130,7 +130,7 @@ struct AT_CUDA_API CUDAEvent { // Note: cudaEventElapsedTime can be safely called from any device float elapsed_time(const CUDAEvent& other) const { - AT_CHECK(is_created_ && other.isCreated(), + TORCH_CHECK(is_created_ && other.isCreated(), "Both events must be recorded before calculating elapsed time."); float time_ms = 0; // raise cudaErrorNotReady if either event is recorded but not yet completed diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index b750cf326b39..2f475d501628 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -17,7 +17,7 @@ struct OffsetCalculator { using offset_type = at::cuda::Array; OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides) : dims(dims) { - AT_CHECK(dims <= MAX_DIMS, "tensor has too many (>25) dims"); + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>25) dims"); for (int i = 0; i < MAX_DIMS; ++i) { if (i < dims) { sizes_[i] = IntDivider(sizes[i]); diff --git a/aten/src/ATen/cuda/detail/TensorInfo.cuh b/aten/src/ATen/cuda/detail/TensorInfo.cuh index 7dfa9051e103..b5fcbe222391 100644 --- a/aten/src/ATen/cuda/detail/TensorInfo.cuh +++ b/aten/src/ATen/cuda/detail/TensorInfo.cuh @@ -62,7 +62,7 @@ TensorInfo::TensorInfo(T* p, template void TensorInfo::reduceDim(int dim) { - AT_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); sizes[dim] = 1; } diff --git a/aten/src/ATen/detail/FunctionTraits.h b/aten/src/ATen/detail/FunctionTraits.h index a8f84e6994cf..547bb71ea8aa 100644 --- a/aten/src/ATen/detail/FunctionTraits.h +++ b/aten/src/ATen/detail/FunctionTraits.h @@ -49,6 +49,12 @@ struct function_traits { }; }; +template +struct nullary_function_traits { + using traits = function_traits; + using result_type = typename traits::result_type; +}; + template struct unary_function_traits { using traits = function_traits; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index d4915a7fd0d0..68f0eb4dbc98 100644 --- a/aten/src/ATen/function_wrapper.py +++ 
b/aten/src/ATen/function_wrapper.py @@ -115,19 +115,9 @@ def TypedDict(name, attrs, total=True): # type: ignore TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} - ${dispatch_scalar_type_declaration} - switch (dispatch_scalar_type) { - ${cases} - ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); - break; - default: - AT_ERROR("${api_name} not supported on ${Type} for ", dispatch_scalar_type); - } + ${type_definition_body} } """) -TYPE_DERIVED_DEFINITION_NATIVE_CASE = CodeTemplate("""\ -case ScalarType::${ScalarName}: -""") TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { AT_ERROR("${api_name} not supported on ${Type}"); @@ -1660,10 +1650,9 @@ def process_native(option): TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) else: option['native_type_method_dispatch'] = native_dispatch - cases = [] - for scalar_type in option['backend_types'][backend]: - cases.append(TYPE_DERIVED_DEFINITION_NATIVE_CASE.substitute(env, ScalarName=scalar_type)) - type_object_definitions.append(TYPE_DERIVED_DEFINITION_NATIVE.substitute(env, cases=cases)) + body = TYPE_DEFINITION_BODY_NATIVE.substitute(env) + type_object_definitions.append( + TYPE_DERIVED_DEFINITION_NATIVE.substitute(env, type_definition_body=body)) for declaration in declarations: for option in declaration['options']: diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index f4bdf22621a6..1960b95dd300 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -188,7 +188,9 @@ def backend_to_devicetype(backend): # scalar_name, c_type, accreal, is_floating_type quantized_scalar_types = [ - ('QInt8', 'qint8', 'QInt8AccrealNotDefined', 'Qint8IsFloatingTypeNotDefined'), + ('QInt8', 'qint8', 'QInt8AccrealNotDefined', 'QInt8IsFloatingTypeNotDefined'), + ('QUInt8', 'quint8', 'QUInt8AccrealNotDefined', 'QUInt8IsFloatingTypeNotDefined'), + ('QInt32', 'qint32', 'QInt32AccrealNotDefined', 'Qint32IsFloatingTypeNotDefined'), ] @@ -390,9 +392,6 @@ def legacy_iterate_types(): for scalar_type in (scalar_types + quantized_scalar_types): if density == 'Mkldnn' and (backend != 'CPU' or scalar_type[0] != 'Float'): continue - if density == 'Sparse' and scalar_type[0] == 'Half': - # THS does not do half type yet. - continue else: yield (backend, density, scalar_type) for backend in quantized_backends: diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h index 77176a3acada..b80fc02baeeb 100644 --- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -16,7 +16,7 @@ class HIPStreamMasqueradingAsCUDA { explicit HIPStreamMasqueradingAsCUDA(Stream stream) : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) { // We did the coercion unchecked; check that it was right. - AT_CHECK(stream.device().type() == DeviceType::CUDA /* !!! */); + TORCH_CHECK(stream.device().type() == DeviceType::CUDA /* !!! 
*/); } explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream) diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 55bc3c44df50..b35017ec0737 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -147,8 +147,8 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { auto input = self.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(weight.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int64_t weight_num = weight.numel(); Tensor result = at::empty_like(input); @@ -162,7 +162,7 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { } else { // case2: multiple weights, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; @@ -173,7 +173,7 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); @@ -276,9 +276,9 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten auto grad_out = grad_out_.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(grad_out.is_contiguous()); - AT_CHECK(weight.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(grad_out.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int64_t weight_num = weight.numel(); auto strides = input.strides(); @@ -296,7 +296,7 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten } else { // case2: multiple parameters, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; @@ -307,7 +307,7 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. 
Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 1546621da55f..e2badf7d6f05 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -77,13 +77,13 @@ namespace { IntArrayRef output_size) { for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); /* sizes */ diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp new file mode 100644 index 000000000000..38ae3d5f63ce --- /dev/null +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -0,0 +1,312 @@ +#include +#include + +namespace at { +namespace native { + +namespace { + +inline int start_index(int a, int b, int c) { + return (int)std::floor((float)(a * c) / b); +} + +inline int end_index(int a, int b, int c) { + return (int)std::ceil((float)((a + 1) * c) / b); +} + +template +static void adaptive_avg_pool3d_out_frame( + scalar_t* input_p, + scalar_t* output_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW, + int64_t istrideD, + int64_t istrideT, + int64_t istrideH, + int64_t istrideW) { + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) { + /* loop over output */ + int64_t ot, oh, ow; + for (ot = 0; ot < osizeT; ot++) { + int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for (oh = 0; oh < osizeH; oh++) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = 0; ow < osizeW; ow++) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + /* local pointers */ + scalar_t* ip = input_p + d * istrideD + istartT * istrideT + + istartH * istrideH + istartW * istrideW; + scalar_t* op = output_p + d * osizeT * osizeH * osizeW + + ot * osizeH * osizeW + oh * osizeW + ow; + + /* compute local average: */ + scalar_t sum = 0; + int it, ih, iw; + for (it = 0; it < kT; it++) { + for (ih = 0; ih < kH; ih++) { + for (iw = 0; iw < kW; iw++) { + scalar_t val = + *(ip + it * istrideT + ih * istrideH + iw * istrideW); + sum += val; + } + } + } + + /* set output to local average */ + *op = sum / kT / kH / kW; + } + } + } + } +} + +void adaptive_avg_pool3d_out_cpu_template( + Tensor& output, + Tensor const& input, + IntArrayRef output_size) { + for (int64_t i = 0; i < input.ndimension(); i++) { + TORCH_CHECK( + input.size(i) > 0, + "adaptive_avg_pool3d(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", + input.sizes(), + " with dimension ", + i, + " being " + "empty"); + } + + TORCH_CHECK( + (input.ndimension() == 4 || input.ndimension() == 5), + "non-empty 4D or 5D (batch mode) tensor expected for input"); + + /* sizes */ + int64_t sizeD = input.size(-4); + int64_t isizeT = input.size(-3); + int64_t isizeH = 
input.size(-2); + int64_t isizeW = input.size(-1); + /* strides */ + int64_t istrideD = input.stride(-4); + int64_t istrideT = input.stride(-3); + int64_t istrideH = input.stride(-2); + int64_t istrideW = input.stride(-1); + /* output sizes */ + auto osizeT = output_size[0]; + auto osizeH = output_size[1]; + auto osizeW = output_size[2]; + + if (input.ndimension() == 4) { + output.resize_({sizeD, osizeT, osizeH, osizeW}); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { + auto input_data = input.data(); + auto output_data = output.data(); + adaptive_avg_pool3d_out_frame( + input_data, + output_data, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW, + istrideD, + istrideT, + istrideH, + istrideW); + }); + } else { + output.resize_({input.size(-5), sizeD, osizeT, osizeH, osizeW}); + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < input.size(0); b++) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { + auto input_data = input.data(); + auto output_data = output.data(); + adaptive_avg_pool3d_out_frame( + input_data + b * input.stride(0), + output_data + b * sizeD * osizeT * osizeH * osizeW, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW, + istrideD, + istrideT, + istrideH, + istrideW); + }); + } + } +} + +template +static void adaptive_avg_pool3d_backward_out_frame( + scalar_t* gradInput_p, + scalar_t* gradOutput_p, + int64_t sizeD, + int64_t isizeT, + int64_t isizeH, + int64_t isizeW, + int64_t osizeT, + int64_t osizeH, + int64_t osizeW) { + int64_t d; +#pragma omp parallel for private(d) + for (d = 0; d < sizeD; d++) { + scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH; + scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; + + /* calculate average */ + int64_t ot, oh, ow; + for (ot = 0; ot < osizeT; ot++) { + int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + for (oh = 0; oh < osizeH; oh++) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = 0; ow < osizeW; ow++) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + scalar_t grad_delta = + gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT / + kH / kW; + + int it, ih, iw; + for (it = istartT; it < iendT; it++) { + for (ih = istartH; ih < iendH; ih++) { + for (iw = istartW; iw < iendW; iw++) { + /* update gradient */ + gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] += + grad_delta; + } + } + } + } + } + } + } +} + +Tensor& adaptive_avg_pool3d_backward_out_cpu_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + /* get contiguous gradOutput */ + auto gradOutput = gradOutput_.contiguous(); + + /* sizes */ + int64_t sizeD = input.size(-4); + int64_t isizeT = input.size(-3); + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); + int64_t osizeT = gradOutput.size(-3); + int64_t osizeH = gradOutput.size(-2); + int64_t osizeW = gradOutput.size(-1); + + /* backprop */ + if (input.ndimension() == 4) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { + /* get raw pointers */ + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + + adaptive_avg_pool3d_backward_out_frame( + 
gradInput_data, + gradOutput_data, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW); + }); + } else { + int64_t b; +#pragma omp parallel for private(b) + for (b = 0; b < input.size(0); b++) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { + /* get raw pointers */ + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + adaptive_avg_pool3d_backward_out_frame( + gradInput_data + b * sizeD * isizeT * isizeH * isizeW, + gradOutput_data + b * sizeD * osizeT * osizeH * osizeW, + sizeD, + isizeT, + isizeH, + isizeW, + osizeT, + osizeH, + osizeW); + }); + } + } + return gradInput; +} + +} // namespace + +Tensor& adaptive_avg_pool3d_out_cpu( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + adaptive_avg_pool3d_out_cpu_template(output, input, output_size); + return output; +} + +Tensor adaptive_avg_pool3d_cpu(Tensor const& input, IntArrayRef output_size) { + auto output = at::empty({0}, input.options()); + adaptive_avg_pool3d_out_cpu_template(output, input, output_size); + return output; +} + +Tensor& adaptive_avg_pool3d_backward_out_cpu( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + gradInput.resize_as_(input).zero_(); + adaptive_avg_pool3d_backward_out_cpu_template(gradInput, gradOutput_, input); + return gradInput; +} + +Tensor adaptive_avg_pool3d_backward_cpu( + const Tensor& gradOutput_, + const Tensor& input) { + auto gradInput = at::zeros_like(input); + adaptive_avg_pool3d_backward_out_cpu_template(gradInput, gradOutput_, input); + return gradInput; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp index c6a774e98486..ae36afe5290e 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp @@ -134,16 +134,16 @@ void adaptive_max_pool2d_out_cpu_template( int64_t istrideB = 0; for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool2d: expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 2, + TORCH_CHECK(output_size.size() == 2, "adaptive_max_pool2d: internal error: output_size.size() must be 2"); if (input.ndimension() == 4) diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 5b9cdb08595c..7d6581d78bef 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -156,16 +156,16 @@ void adaptive_max_pool3d_out_cpu_template( int64_t istrideW = 0; for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool3d: expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 4 || input.ndimension() == 5), + TORCH_CHECK((input.ndimension() == 4 || input.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 3, + TORCH_CHECK(output_size.size() == 3, 
"adaptive_max_pool3d: internal error: output_size.size() must be 3"); if (input.ndimension() == 5) diff --git a/aten/src/ATen/native/AffineGridGenerator.cpp b/aten/src/ATen/native/AffineGridGenerator.cpp index 7ab91d2d8a50..e54aa6e56061 100644 --- a/aten/src/ATen/native/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/AffineGridGenerator.cpp @@ -67,7 +67,7 @@ Tensor affine_grid_generator_5D( } Tensor affine_grid_generator(const Tensor& theta, IntArrayRef size) { - AT_CHECK( + TORCH_CHECK( size.size() == 4 || size.size() == 5, "AffineGridGenerator needs 4d (spatial) or 5d (volumetric) inputs."); if (size.size() == 4) { @@ -108,7 +108,7 @@ Tensor affine_grid_generator_5D_backward( } Tensor affine_grid_generator_backward(const Tensor& grad, IntArrayRef size) { - AT_CHECK( + TORCH_CHECK( size.size() == 4 || size.size() == 5, "AffineGridGenerator needs 4d (spatial) or 5d (volumetric) inputs."); if (size.size() == 4) { diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index ef6268c6bad1..c71ff615d858 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -138,28 +138,22 @@ static void apply_solve(Tensor& b, Tensor& A, std::vector& infos) { #else auto A_data = A.data(); auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); auto ipiv = at::empty({n}, b.options().dtype(kInt)); int info; - if (b.dim() == 2) { - lapackSolve(n, nrhs, A_data, n, ipiv.data(), b_data, n, &info); - infos[0] = info; - } else { - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackSolve(n, nrhs, A_working_ptr, n, ipiv.data(), b_working_ptr, n, &info); - infos[i] = info; - if (info != 0) { - return; - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackSolve(n, nrhs, A_working_ptr, n, ipiv.data(), b_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; } } #endif @@ -182,9 +176,9 @@ std::tuple _solve_helper_cpu(const Tensor& self, const Tensor& A // Supports arbitrary batch dimensions for self and A std::tuple solve(const Tensor& self, const Tensor& A) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "B should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - AT_CHECK(A.dim() >= 2, + TORCH_CHECK(A.dim() >= 2, "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); Tensor self_broadcasted, A_broadcasted; std::tie(self_broadcasted, A_broadcasted) = _linear_solve_broadcast_args(self, A); @@ -208,7 +202,6 @@ static void apply_inverse(Tensor& self, std::vector& infos) { #else auto self_data = self.data(); auto self_matrix_stride = matrixStride(self); - auto batch_size = batchCount(self); auto n = self.size(-2); @@ -217,8 +210,8 @@ static void apply_inverse(Tensor& self, std::vector& infos) { scalar_t wkopt; Tensor work; + int info; for (int64_t i = 0; i < batch_size; i++) { - int info; scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; lapackLu(n, n, self_working_ptr, n, ipiv.data(), &info); infos[i] = info; @@ -249,7 +242,11 @@ Tensor _inverse_helper_cpu(const Tensor& 
self) { AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cpu", [&]{ apply_inverse(self_working_copy, infos); }); - batchCheckErrors(infos, "inverse_cpu"); + if (self.dim() > 2) { + batchCheckErrors(infos, "inverse_cpu"); + } else { + singleCheckErrors(infos[0], "inverse_cpu"); + } return self_working_copy; } @@ -257,9 +254,6 @@ Tensor inverse(const Tensor &self) { if (self.size(-1) == 0) { return at::empty_like(self); } - if (self.dim() == 2) { - return at::legacy::th::_th_getri_single(self); - } squareCheckInputs(self); return at::_inverse_helper(self); } @@ -283,25 +277,20 @@ static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, std::vector(); auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); int info; - if (b.dim() == 2) { - lapackCholeskySolve(uplo, n, nrhs, A_data, n, b_data, n, &info); - infos[0] = info; - } else { - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackCholeskySolve(uplo, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info); - infos[i] = info; - if (info != 0) { - return; - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackCholeskySolve(uplo, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; } } #endif @@ -324,9 +313,9 @@ Tensor _cholesky_solve_helper_cpu(const Tensor& self, const Tensor& A, bool uppe // Supports arbitrary batch dimensions for self and A Tensor cholesky_solve(const Tensor& self, const Tensor& A, bool upper) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - AT_CHECK(A.dim() >= 2, + TORCH_CHECK(A.dim() >= 2, "u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); Tensor self_broadcasted, A_broadcasted; std::tie(self_broadcasted, A_broadcasted) = _linear_solve_broadcast_args(self, A); @@ -350,22 +339,17 @@ static void apply_cholesky(Tensor& self, bool upper, std::vector& infos char uplo = upper ? 
'U' : 'L'; auto self_data = self.data(); + auto self_matrix_stride = matrixStride(self); + auto batch_size = batchCount(self); auto n = self.size(-2); int info; - if (self.dim() == 2) { - lapackCholesky(uplo, n, self_data, n, &info); - infos[0] = info; - } else { - auto self_matrix_stride = matrixStride(self); - auto batch_size = batchCount(self); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - lapackCholesky(uplo, n, self_working_ptr, n, &info); - infos[i] = info; - if (info != 0) { - return; - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; + lapackCholesky(uplo, n, self_working_ptr, n, &info); + infos[i] = info; + if (info != 0) { + return; } } #endif @@ -417,28 +401,23 @@ static void apply_lu(Tensor& self, Tensor& pivots, Tensor& infos) { auto self_data = self.data(); auto pivots_data = pivots.data(); auto infos_data = infos.data(); - + auto self_matrix_stride = matrixStride(self); + auto pivots_matrix_stride = pivots.size(-1); + auto batch_size = batchCount(self); auto n = self.size(-1); - if (self.dim() == 2) { - lapackLu(n, n, self_data, n, pivots_data, infos_data); - } else { - auto self_matrix_stride = matrixStride(self); - auto batch_size = batchCount(self); - auto pivots_matrix_stride = pivots.size(-1); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - int* pivots_working_ptr = &pivots_data[i * pivots_matrix_stride]; - int* infos_working_ptr = &infos_data[i]; - lapackLu(n, n, self_working_ptr, n, pivots_working_ptr, infos_working_ptr); - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; + int* pivots_working_ptr = &pivots_data[i * pivots_matrix_stride]; + int* infos_working_ptr = &infos_data[i]; + lapackLu(n, n, self_working_ptr, n, pivots_working_ptr, infos_working_ptr); } #endif } std::tuple _lu_with_info_cpu(const Tensor& self, bool pivot, bool check_errors) { - AT_CHECK(pivot, "lu without pivoting is not implemented on the CPU"); - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(pivot, "lu without pivoting is not implemented on the CPU"); + TORCH_CHECK(self.dim() >= 2, "expected tensor with 2 or more dimensions, got size: ", self.sizes(), " instead"); squareCheckInputs(self); @@ -458,10 +437,10 @@ std::tuple _lu_with_info_cpu(const Tensor& self, bool pi }); } if (check_errors) { - if (self.dim() == 2) { - singleCheckErrors(infos_tensor.item(), "lu"); - } else { + if (self.dim() > 2) { batchCheckErrors(infos_tensor, "lu"); + } else { + singleCheckErrors(infos_tensor.item(), "lu"); } } return std::make_tuple(self_working_copy, pivots_tensor, infos_tensor); @@ -621,21 +600,17 @@ static void apply_triangular_solve(Tensor& b, Tensor& A, bool upper, bool transp auto A_data = A.data(); auto b_data = b.data(); + auto A_mat_stride = matrixStride(A); + auto b_mat_stride = matrixStride(b); + auto batch_size = batchCount(A); auto n = A.size(-2); auto nrhs = b.size(-1); int info; - if (b.dim() == 2) { - lapackTriangularSolve(uplo, trans, diag, n, nrhs, A_data, n, b_data, n, &info); - } else { - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - for (int64_t i = 0; i < batch_size; i++) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackTriangularSolve(uplo, trans, diag, n, nrhs, A_working_ptr, n, b_working_ptr, 
n, &info); - } + for (int64_t i = 0; i < batch_size; i++) { + scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; + lapackTriangularSolve(uplo, trans, diag, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info); } #endif } @@ -653,9 +628,9 @@ std::tuple _triangular_solve_helper_cpu(const Tensor& self, cons // Supports arbitrary batch dimensions for self and A std::tuple triangular_solve(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - AT_CHECK(A.dim() >= 2, + TORCH_CHECK(A.dim() >= 2, "u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); Tensor self_broadcasted, A_broadcasted; std::tie(self_broadcasted, A_broadcasted) = _linear_solve_broadcast_args(self, A); diff --git a/aten/src/ATen/native/ConstantPadNd.cpp b/aten/src/ATen/native/ConstantPadNd.cpp index 594604688149..48caea7f54ba 100644 --- a/aten/src/ATen/native/ConstantPadNd.cpp +++ b/aten/src/ATen/native/ConstantPadNd.cpp @@ -3,7 +3,7 @@ namespace at { namespace native { Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, Scalar value) { - AT_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", + TORCH_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", pad.size()); auto input_sizes = self.sizes(); @@ -11,7 +11,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, Scalar value) { auto l_pad = pad.size() / 2; auto l_diff = l_inp - l_pad; - AT_CHECK(l_inp >= l_pad, "Length of pad should be no more than twice the number of " + TORCH_CHECK(l_inp >= l_pad, "Length of pad should be no more than twice the number of " "dimensions of the input. Pad length is ", pad.size(), "while the input has ", l_inp, "dimensions."); @@ -48,7 +48,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, Scalar value) { for (int i = 0; i < l_pad; i++) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - AT_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", pad[pad_idx], " and ", pad[pad_idx + 1], "resulted in a negative output size, " "which is invalid. 
Check dimension ", l_diff + i, "of your input."); new_shape.emplace_back(new_dim); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index bc02fdc25119..34afeeb9b586 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -211,18 +211,18 @@ static void check_shape_forward(const at::Tensor& input, auto dilation = params.dilation; bool transposed = params.transposed; - AT_CHECK(!params.is_padding_neg(), "negative padding is not supported"); - AT_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); - AT_CHECK(!params.is_stride_neg(), "negative stride is not supported"); + TORCH_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); + TORCH_CHECK(!params.is_stride_neg(), "negative stride is not supported"); - AT_CHECK(weight_dim == k, + TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, "-dimensional weight ", weight_sizes, ", but got ", k, "-dimensional input of size ", input.sizes(), " instead"); - AT_CHECK(weight_sizes[0] >= groups, + TORCH_CHECK(weight_sizes[0] >= groups, "Given groups=", groups, ", expected weight to be at least ", groups, " at dimension 0, but got weight of size ", weight_sizes, " instead"); - AT_CHECK(weight_sizes[0] % groups == 0, + TORCH_CHECK(weight_sizes[0] % groups == 0, "Given groups=", groups, ", expected weight to be divisible by ", groups, " at dimension 0, but got weight of size ", weight_sizes, " instead"); @@ -232,12 +232,12 @@ static void check_shape_forward(const at::Tensor& input, std::vector kernel_shape; bool kernel_size_correct = true; - AT_CHECK(input.size(1) == (weight_sizes[1] * groups), + TORCH_CHECK(input.size(1) == (weight_sizes[1] * groups), "Given groups=", groups, ", weight of size ", weight_sizes, ", expected input", input.sizes(), " to have ", (weight_sizes[1] * groups), " channels, but got ", input.size(1), " channels instead"); - AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]), + TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]), "Given weight of size ", weight_sizes, ", expected bias to be 1-dimensional with ", weight_sizes[0], " elements", ", but got bias of size ", bias.sizes(), " instead"); @@ -251,7 +251,7 @@ static void check_shape_forward(const at::Tensor& input, } } - AT_CHECK(input_shape.size() == kernel_shape.size(), "Inconsistent shape between Input and Kernel"); + TORCH_CHECK(input_shape.size() == kernel_shape.size(), "Inconsistent shape between Input and Kernel"); if (!kernel_size_correct) { // If kernel size is incorrect @@ -270,11 +270,11 @@ static void check_shape_forward(const at::Tensor& input, "Kernel size: (", kernel_ss.str(), "). 
Kernel size can't be greater than actual input size"); } } else { // transposed - AT_CHECK(input.size(1) == weight_sizes[0], + TORCH_CHECK(input.size(1) == weight_sizes[0], "Given transposed=", transposed, ", weight of size ", weight_sizes, ", expected input", input.sizes(), " to have ", weight_sizes[0], " channels, but got ", input.size(1), " channels instead"); - AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups), + TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups), "Given transposed=", transposed, ", weight of size ", weight_sizes, ", expected bias to be 1-dimensional with ", weight_sizes[1] * groups, " elements", ", but got bias of size ", bias.sizes(), " instead"); @@ -282,14 +282,14 @@ static void check_shape_forward(const at::Tensor& input, } static auto view4d(const at::Tensor& tensor) -> at::Tensor { - AT_CHECK(tensor.ndimension() == 3, + TORCH_CHECK(tensor.ndimension() == 3, "expected 3D tensor, got tensor with ", tensor.ndimension(), " dimensions instead"); return tensor.unsqueeze(2); } static auto view3d(const at::Tensor& tensor) -> at::Tensor { - AT_CHECK(tensor.ndimension() == 4, + TORCH_CHECK(tensor.ndimension() == 4, "expected 4D tensor, got tensor with ", tensor.ndimension(), " dimensions instead"); return tensor.squeeze(2); @@ -378,7 +378,7 @@ at::Tensor _convolution( } int64_t dim = k - 2; - AT_CHECK(dim > 0, "weight should have at least three dimensions"); + TORCH_CHECK(dim > 0, "weight should have at least three dimensions"); ConvParams params; params.stride = expand_param_if_needed(stride_, "stride", dim); @@ -409,10 +409,10 @@ at::Tensor _convolution( auto dilation = params.dilation; output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); } else if (params.use_cudnn(input)) { - AT_CHECK(input.type() == weight.type(), + TORCH_CHECK(input.type() == weight.type(), "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), ") should be the same"); - AT_CHECK(!bias.defined() || (input.type() == bias.type()), + TORCH_CHECK(!bias.defined() || (input.type() == bias.type()), "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), ") should be the same"); @@ -426,10 +426,10 @@ at::Tensor _convolution( params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); } } else if (params.use_miopen(input)) { - AT_CHECK(input.type() == weight.type(), + TORCH_CHECK(input.type() == weight.type(), "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), ") should be the same"); - AT_CHECK(!bias.defined() || (input.type() == bias.type()), + TORCH_CHECK(!bias.defined() || (input.type() == bias.type()), "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), ") should be the same"); @@ -444,10 +444,10 @@ at::Tensor _convolution( } } else if (params.use_mkldnn(input)) { #if AT_MKLDNN_ENABLED() - AT_CHECK(input.type() == weight.type(), + TORCH_CHECK(input.type() == weight.type(), "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), ") should be the same"); - AT_CHECK(!bias.defined() || (input.type() == bias.type()), + TORCH_CHECK(!bias.defined() || (input.type() == bias.type()), "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), ") should be the same"); if (!input_is_mkldnn) { diff --git 
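Most of the hunks in this stretch of the patch are a mechanical rename of AT_CHECK to TORCH_CHECK; the call shape is unchanged: a boolean condition followed by any number of message fragments that are stringified and concatenated into the error text, with a failed check throwing c10::Error. A small self-contained usage sketch; check_is_matrix and the shapes below are illustrative, not taken from the patch:

#include <ATen/ATen.h>
#include <c10/util/Exception.h>

// Validate an argument the same way the call sites above do: condition first,
// then message pieces that are concatenated into the thrown error message.
void check_is_matrix(const at::Tensor& t) {
  TORCH_CHECK(t.dim() == 2,
              "expected a 2D tensor, but got ", t.dim(), " dimensions instead");
}

int main() {
  check_is_matrix(at::zeros({3, 3}));   // passes silently
  try {
    check_is_matrix(at::zeros({3}));    // fails: 1D input
  } catch (const c10::Error& e) {
    // e.what() contains "expected a 2D tensor, but got 1 dimensions instead"
  }
  return 0;
}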
a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index c798582dc0b7..90eec5d7853f 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -6,11 +6,11 @@ namespace at { namespace native { Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, int64_t pad) { - AT_CHECK(self.dim() == 3, "Input must have 3 dims: time, batch, " + TORCH_CHECK(self.dim() == 3, "Input must have 3 dims: time, batch, " "in_channel"); - AT_CHECK(weight.dim() == 3, "Weight tensor must have 3 dims: kernel_width," + TORCH_CHECK(weight.dim() == 3, "Weight tensor must have 3 dims: kernel_width," " in_channels, out_channels."); - AT_CHECK(bias.dim() == 1, "Bias must be 1-D"); + TORCH_CHECK(bias.dim() == 1, "Bias must be 1-D"); auto input_size = self.sizes(); auto weight_size = weight.sizes(); @@ -27,9 +27,9 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in // Input = (time, batch, in_channels) // Weight = (kernel_width, in_channels, out_channels) // Bias = (out_channels) - AT_CHECK(inputPlanes == weight_size[1], "Input dim 2 (input channels) " + TORCH_CHECK(inputPlanes == weight_size[1], "Input dim 2 (input channels) " "is not == dim 1 in the weight tensor"); - AT_CHECK(weight_size[2] == bias.sizes()[0], "Bias size must equal dim 2 in " + TORCH_CHECK(weight_size[2] == bias.sizes()[0], "Bias size must equal dim 2 in " "the weight tensor (output channels)."); // input * weights + bias -> output_features diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index d55b554dd69e..59ec3a3c95ff 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -1,121 +1,142 @@ #include #include -#include #include -#include #include -#include +#include +#include namespace { -bool copy_transpose_valid(const at::Tensor& self, const at::Tensor& src) { +using namespace at; + +bool copy_transpose_valid(const Tensor& self, const Tensor& src) { const int MIN_SZ = 60 * 60; return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 && src.stride(0) == 1 && src.stride(1) == src.size(0) && + self.scalar_type() == src.scalar_type() && self.numel() >= MIN_SZ; } +// special case copy where tensor is contiguous and src is a transposed matrix +// This can be generalized to most copies, but it's trickier +void copy_same_type_transpose_(Tensor& self, const Tensor& src) { + int64_t BLOCK_SZ; + if (self.scalar_type() == kByte) { + BLOCK_SZ = 120; + } else { + BLOCK_SZ = 60; + } + Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); + + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, self.scalar_type(), "copy_", [&] { + scalar_t* sp = src.data(); + scalar_t* rp = self.data(); + scalar_t* bp = buf.data(); + + int64_t NR = src.size(0); + int64_t NC = src.size(1); + for (int64_t R = 0; R < NR; R += BLOCK_SZ) { + for (int64_t C = 0; C < NC; C += BLOCK_SZ) { + scalar_t* spo = sp + R + C * NR; + scalar_t* rpo = rp + C + R * NC; + + int nr = std::min(NR - R, BLOCK_SZ); + int nc = std::min(NC - C, BLOCK_SZ); + + // 1. copy columns from src to buf + for (int c = 0; c < nc; c++) { + memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t)); + } + + // 2. transpose buf in place + int rc_max = std::max(nr, nc); + int rc_min = std::min(nr, nc); + for (int r = 0; r < rc_max; r++) { + int end = std::min(r, rc_min); + for (int c = 0; c < end; c++) { + scalar_t tmp = bp[r + BLOCK_SZ * c]; + bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; + bp[r * BLOCK_SZ + c] = tmp; + } + } + + // 3. 
copy rows from buf to dst + for (int r = 0; r < nr; r++) { + memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t)); + } + } + } + }); +} + +// Devices directly supported by this copy implementation. Other device types +// (e.g. XLA) may be supported by overriding copy_ and _copy_from. +bool is_supported_device(Device device) { + DeviceType device_type = device.type(); + return device_type == kCPU || device_type == kCUDA || device_type == kHIP; +} + } // namespace namespace at { namespace native { Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { - Tensor b_src; + // TODO: this should be handled during dispatch, but that's missing... + TORCH_CHECK(self.defined(), "self is undefined"); + TORCH_CHECK(src.defined(), "src is undefined"); + if (self.is_sparse() && src.is_sparse()) { return at::copy_sparse_to_sparse_(self, src, non_blocking); + } else if (self.is_sparse() || src.is_sparse()) { + AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", + self.type(), " and src type = ", src.type()); } - if (!self.is_sparse() && !src.is_sparse()) { - std::tie(b_src) = expand_inplace(self, src, "copy"); - return s_copy_(self, b_src, non_blocking); - } - AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", - self.type(), " and src type = ", src.type()); -} -Tensor& _s_copy__cpu(Tensor& self, const Tensor& src, bool non_blocking) { - if (src.type_id() != CPUTensorId()) { - _s_copy_from(src, self, non_blocking); + if (self.is_same(src)) { return self; } - if (self.scalar_type() == src.scalar_type()) { - copy_kernel_same_type(kCPU, self, src); - } else { - AT_CHECK(self.numel() == src.numel(), "sizes do not match"); - copy_kernel_cast(kCPU, self, src); + // Re-dispatch copies when src device not implemented here (e.g. XLA). + // This includes: cpu_tensor.copy_(xla_tensor) which + // calls xla_tensor._copy_from(cpu_tensor) + if (!is_supported_device(src.device())) { + TORCH_INTERNAL_ASSERT(is_supported_device(self.device())); + at::_copy_from(src, self, non_blocking); + return self; } - return self; -} -// special case copy where tensor is contiguous and src is a transposed matrix -// This can be generalized to most copies, but it's tricker -void _copy_same_type_transpose_(Tensor& self, const Tensor& src) { - int64_t BLOCK_SZ; - if (self.scalar_type() == kByte) { - BLOCK_SZ = 120; - } else { - BLOCK_SZ = 60; + if (self.scalar_type() == kQUInt8) { + return quantized_copy_(self, src); } - Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, self.scalar_type(), "_copy_same_type_transpose_", [&]() { - scalar_t* sp = src.data(); - scalar_t* rp = self.data(); - scalar_t* bp = buf.data(); - - int64_t NR = src.size(0); - int64_t NC = src.size(1); - for (int64_t R = 0; R < NR; R += BLOCK_SZ) { - for (int64_t C = 0; C < NC; C += BLOCK_SZ) { - scalar_t* spo = sp + R + C * NR; - scalar_t* rpo = rp + C + R * NC; - - int nr = std::min(NR - R, BLOCK_SZ); - int nc = std::min(NC - C, BLOCK_SZ); - - // 1. copy columns from src to buf - for (int c = 0; c < nc; c++) { - memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t)); - } - - // 2.
transpose buf in place - int rc_max = std::max(nr, nc); - int rc_min = std::min(nr, nc); - for (int r = 0; r < rc_max; r++) { - int end = std::min(r, rc_min); - for (int c = 0; c < end; c++) { - scalar_t tmp = bp[r + BLOCK_SZ * c]; - bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; - bp[r * BLOCK_SZ + c] = tmp; - } - } - - // 3. copy rows from buf to dst - for (int r = 0; r < nr; r++) { - memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t)); - } - } - } - }); -} + auto builder = TensorIterator::Builder(); + builder.add_output(self); + builder.add_input(src); + builder.dont_resize_outputs(); + builder.dont_compute_common_dtype(); + auto iter = builder.build(); -void _copy_same_type__cpu(Tensor& self, const Tensor& src) { - if (self.is_same(src)) { - return; + if (iter->numel() == 0) { + return self; } - if (self.numel() == src.numel() && copy_transpose_valid(self, src)) { - return _copy_same_type_transpose_(self, src); + DeviceType device_type = iter->device_type(0); + if (iter->device_type(1) == kCUDA) { + device_type = kCUDA; } - copy_kernel_same_type(kCPU, self, src); + if (device_type == kCPU && copy_transpose_valid(self, src)) { + copy_same_type_transpose_(self, src); + return self; + } + + copy_stub(device_type, *iter, non_blocking); + return self; } -DEFINE_DISPATCH(copy_kernel_cast); -DEFINE_DISPATCH(copy_kernel_same_type); +DEFINE_DISPATCH(copy_stub); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 08989740a17b..a8d16f6f7b87 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -1,8 +1,12 @@ #pragma once #include +#include namespace at { + +struct TensorIterator; + namespace native { // Note [Implicit conversion between signed and unsigned] @@ -43,5 +47,9 @@ struct inter_copy_type { template using inter_copy_type_t = typename inter_copy_type::type; +using copy_fn = void (*)(TensorIterator&, bool non_blocking); + +DECLARE_DISPATCH(copy_fn, copy_stub); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 8788969797a5..4e1e230cc37d 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -16,17 +16,17 @@ Tensor cross(const Tensor & input, const Tensor & other, const c10::optional dimension) { auto device_res = input.type().device_type(); - AT_CHECK(device_res == kCPU || device_res == kCUDA, "cross only supports CPU and CUDA devices, out got: ", device_res); + TORCH_CHECK(device_res == kCPU || device_res == kCUDA, "cross only supports CPU and CUDA devices, out got: ", device_res); auto device1 = input.type().device_type(); - AT_CHECK(device1 == kCPU || device1 == kCUDA, "cross only supports CPU and CUDA devices, input got: ", device1); + TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "cross only supports CPU and CUDA devices, input got: ", device1); auto device2 = other.type().device_type(); - AT_CHECK(device2 == kCPU || device2 == kCUDA, "cross only supports CPU and CUDA devices, other got: ", device2); - AT_CHECK(device_res == device1, "out and input must have the same device type. out: ", device_res, " input: ", device1); - AT_CHECK(device1 == device2, "input and other must have the same device type. 
input: ", device1, " other: ", device2); - AT_CHECK(!out.is_cuda() || out.get_device() == input.get_device(), "device of out (", input.get_device(), ") must match device of input (", other.get_device(), ")"); - AT_CHECK(!input.is_cuda() || input.get_device() == other.get_device(), "device of input (", input.get_device(), ") must match device of other (", other.get_device(), ")"); - AT_CHECK(input.dim() == other.dim(), "inconsistent tensors dimensions input: ", input.dim(), " other: ", other.dim()); - AT_CHECK(input.sizes() == other.sizes(), "inconsistent tensors sizes input: ", input.sizes(), " other: ", other.sizes()); + TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "cross only supports CPU and CUDA devices, other got: ", device2); + TORCH_CHECK(device_res == device1, "out and input must have the same device type. out: ", device_res, " input: ", device1); + TORCH_CHECK(device1 == device2, "input and other must have the same device type. input: ", device1, " other: ", device2); + TORCH_CHECK(!out.is_cuda() || out.get_device() == input.get_device(), "device of out (", input.get_device(), ") must match device of input (", other.get_device(), ")"); + TORCH_CHECK(!input.is_cuda() || input.get_device() == other.get_device(), "device of input (", input.get_device(), ") must match device of other (", other.get_device(), ")"); + TORCH_CHECK(input.dim() == other.dim(), "inconsistent tensors dimensions input: ", input.dim(), " other: ", other.dim()); + TORCH_CHECK(input.sizes() == other.sizes(), "inconsistent tensors sizes input: ", input.sizes(), " other: ", other.sizes()); int64_t dim = -1; if(!dimension.has_value()) { @@ -36,10 +36,10 @@ Tensor & cross_out(Tensor & out, const Tensor & input, const Tensor & other, con break; } } - AT_CHECK(dim >= 0, "no dimension of size 3 in input"); + TORCH_CHECK(dim >= 0, "no dimension of size 3 in input"); } else { dim = maybe_wrap_dim(dimension.value(), input.dim()); - AT_CHECK(input.size(dim) == 3, "dimension ", dimension.value(), " does not have size 3"); + TORCH_CHECK(input.size(dim) == 3, "dimension ", dimension.value(), " does not have size 3"); } if (out.sizes() != input.sizes()) { diff --git a/aten/src/ATen/native/DilatedMaxPool.h b/aten/src/ATen/native/DilatedMaxPool.h new file mode 100644 index 000000000000..71f5bb5fe813 --- /dev/null +++ b/aten/src/ATen/native/DilatedMaxPool.h @@ -0,0 +1,112 @@ +#include +#include +#include +#include + +#pragma once + +namespace at { +namespace native { + +namespace { + +template +static inline dest_t +safe_downcast(src_t v) +{ + TORCH_CHECK(std::numeric_limits::min() <= v && v <= std::numeric_limits::max(), + "integer out of range"); + + return static_cast(v); +} + +template +static inline T pooling_output_shape( + T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) { + T outputSize = ((inputSize + 2 * pad - dilation * (kernelSize - 1) - 1 + (ceil_mode ? 
stride - 1 : 0)) / stride + 1); + if (pad) { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputSize - 1) * stride >= inputSize + pad) + --outputSize; + } + return outputSize; +} + +static inline void +max_pool2d_with_indices_shape_check( + const Tensor& input, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth) +{ + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got ", + "kH: ", kH, " kW: ", kW); + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got " + "dH: ", dH, " dW: ", dW); + TORCH_CHECK(dilationH > 0 && dilationW > 0, + "dilation should be greater than zero, but got ", + "dilationH: ", dilationH, " dilationW: ", dilationW); + + TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4), + "non-empty 3D or 4D input tensor expected but got ndim: ", ndim); + TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, + "pad should be smaller than half of kernel size, but got ", + "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); + + if (outputWidth < 1 || outputHeight < 1) { + AT_ERROR("Given input size: (", + nInputPlane, "x", inputHeight, "x", inputWidth, "). ", + "Calculated output size: (", + nOutputPlane, "x", outputHeight, "x", outputWidth, "). ", + "Output size is too small"); + } +} + +static inline void +max_pool2d_with_indices_shape_check( + const Tensor& input, + const Tensor& gradOutput, + const Tensor& indices, + int64_t nbatch, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, + bool cuda=false) +{ + max_pool2d_with_indices_shape_check( + input, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth); + + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane); + check_dim_size(gradOutput, ndim, ndim-2, outputHeight); + check_dim_size(gradOutput, ndim, ndim-1, outputWidth); + + if (cuda) { + check_dim_size(indices, 4, 0, nbatch); + check_dim_size(indices, 4, 1, nOutputPlane); + check_dim_size(indices, 4, 2, outputHeight); + check_dim_size(indices, 4, 3, outputWidth); + } + else { + check_dim_size(indices, ndim, ndim-3, nOutputPlane); + check_dim_size(indices, ndim, ndim-2, outputHeight); + check_dim_size(indices, ndim, ndim-1, outputWidth); + } +} + +} // namespace + +} // at::native +} // at diff --git a/aten/src/ATen/native/DilatedMaxPool2d.cpp b/aten/src/ATen/native/DilatedMaxPool2d.cpp new file mode 100644 index 000000000000..acc7eebaf88b --- /dev/null +++ b/aten/src/ATen/native/DilatedMaxPool2d.cpp @@ -0,0 +1,499 @@ +#include +#include +#include +#include +#include + + +namespace at { +namespace native { + +namespace { + +template +static void max_pool2d_with_indices_single_out_frame( + scalar_t *input_p, + scalar_t *output_p, + int64_t *ind_p, + int64_t nslices, + int64_t iwidth, + int64_t iheight, + int64_t owidth, + int64_t oheight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH + ) +{ + at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { + for (auto k = start; k < 
end; k++) + { + /* loop over output */ + int64_t i, j; + scalar_t *ip = input_p + k*iwidth*iheight; + for(i = 0; i < oheight; i++) + { + for(j = 0; j < owidth; j++) + { + int64_t hstart = i * dH - padH; + int64_t wstart = j * dW - padW; + int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight); + int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth); + while(hstart < 0) + hstart += dilationH; + while(wstart < 0) + wstart += dilationW; + + /* local pointers */ + scalar_t *op = output_p + k*owidth*oheight + i*owidth + j; + int64_t *indp = ind_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + int64_t maxindex = -1; + scalar_t maxval = -std::numeric_limits::max(); + int64_t tcntr = 0; + int64_t x,y; + for(y = hstart; y < hend; y += dilationH) + { + for(x = wstart; x < wend; x += dilationW) + { + tcntr = y*iwidth + x; + scalar_t val = *(ip + tcntr); + if ((val > maxval) || std::isnan(val)) + { + maxval = val; + maxindex = tcntr; + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex; + } + } + } + }); +} + +template +static void max_pool2d_with_indices_out_frame( + scalar_t *input_data, + scalar_t *output_data, + int64_t *indices_data, + int64_t nbatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH) +{ + at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { + for (auto p = start; p < end; p++) { + max_pool2d_with_indices_single_out_frame( + input_data+p*nInputPlane*inputWidth*inputHeight, + output_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH); + } + }); +} + +void max_pool2d_with_indices_out_cpu_template( + Tensor& output, + Tensor& indices, + const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + /* sizes */ + const int64_t nbatch = input_.ndimension() == 4 ? 
input_.size(-4) : 1; + const int64_t nInputPlane = input_.size(-3); + const int64_t inputHeight = input_.size(-2); + const int64_t inputWidth = input_.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + max_pool2d_with_indices_shape_check( + input_, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth); + + /* get contiguous input */ + Tensor input = input_.contiguous(); + + /* resize output */ + if (input.ndimension() == 3) + { + output.resize_({nInputPlane, outputHeight, outputWidth}); + /* indices will contain the locations for each output point */ + indices.resize_({nInputPlane, outputHeight, outputWidth}); + + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_cpu", + [&] { + /* get raw pointers */ + scalar_t *input_data = input.data(); + scalar_t *output_data = output.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_single_out_frame( + input_data, output_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH); + } + ); + } + else + { + output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + /* indices will contain the locations for each output point */ + indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_cpu", + [&] { + scalar_t *input_data = input.data(); + scalar_t *output_data = output.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_out_frame( + input_data, + output_data, + indices_data, + nbatch, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH); } + ); + } +} + +template +static void max_pool2d_with_indices_backward_single_out_frame( + scalar_t *gradInput_p, + scalar_t *gradOutput_p, + int64_t *ind_p, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int dW, + int dH) +{ + at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) { + for (auto k = start; k < end; k++) + { + scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; + scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; + int64_t *ind_p_k = ind_p + k*outputWidth*outputHeight; + + /* calculate max points */ + int64_t i, j; + for(i = 0; i < outputHeight; i++) + { + for(j = 0; j < outputWidth; j++) + { + /* retrieve position of max */ + int64_t maxp = ind_p_k[i*outputWidth + j]; + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; + } + } + } + } + }); +} + +template +static void max_pool2d_with_indices_backward_out_frame( + scalar_t *gradInput_data, + scalar_t *gradOutput_data, + int64_t *indices_data, + int64_t nbatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int dW, + int dH) +{ + at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { + for (auto p = start; p < end; p++) { + max_pool2d_with_indices_backward_single_out_frame( + gradInput_data+p*nInputPlane*inputWidth*inputHeight, + gradOutput_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + 
nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + }); +} + +Tensor& max_pool2d_with_indices_backward_out_cpu_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input, + const Tensor& indices, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + /* get contiguous gradOutput */ + const Tensor gradOutput = gradOutput_.contiguous(); + + /* resize */ + gradInput.resize_as_(input); + gradInput.zero_(); + + /* sizes */ + const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + const int64_t outputHeight = gradOutput.size(-2); + const int64_t outputWidth = gradOutput.size(-1); + + /* XXX preserve the existing shape check behavior */ + const int64_t outputHeight_for_shape_check = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth_for_shape_check = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + max_pool2d_with_indices_shape_check( + input, + gradOutput_, + indices, + nbatch, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight_for_shape_check, outputWidth_for_shape_check); + + /* backprop */ + if (input.ndimension() == 3) + { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_backward", + [&] { + /* get raw pointers */ + scalar_t *gradInput_data = gradInput.data(); + scalar_t *gradOutput_data = gradOutput.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_backward_single_out_frame( + gradInput_data, gradOutput_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + ); + } + else + { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), + "max_pool2d_with_indices_backward", + [&] { + /* get raw pointers */ + scalar_t *gradInput_data = gradInput.data(); + scalar_t *gradOutput_data = gradOutput.data(); + int64_t *indices_data = indices.data(); + + max_pool2d_with_indices_backward_out_frame( + gradInput_data, gradOutput_data, + indices_data, + nbatch, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + ); + } + + return gradInput; +} + +} // namespace + +std::tuple max_pool2d_with_indices_out_cpu( + Tensor& output, + Tensor& indices, + const Tensor& input, + 
IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + max_pool2d_with_indices_out_cpu_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +std::tuple max_pool2d_with_indices_cpu( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + Tensor output = at::empty({0}, input.options()); + Tensor indices = at::empty({0}, input.options().dtype(kLong)); + max_pool2d_with_indices_out_cpu_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +Tensor& max_pool2d_with_indices_backward_out_cpu( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + max_pool2d_with_indices_backward_out_cpu_template( + gradInput, + gradOutput_, + input, + indices, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +Tensor max_pool2d_with_indices_backward_cpu( + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + auto gradInput = at::zeros_like(input); + max_pool2d_with_indices_backward_out_cpu_template( + gradInput, + gradOutput_, + input, + indices, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +} // at::native +} // at diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index d8420fc3cd3a..76dad527721e 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -17,28 +17,28 @@ Tensor pairwise_distance(const Tensor& x1, const Tensor& x2, double p, double ep // This is to guarantee that the contiguous memory is passed to the backward pass Tensor pdist(const Tensor& self, const double p) { - AT_CHECK(self.dim() == 2, + TORCH_CHECK(self.dim() == 2, "pdist only supports 2D tensors, got: ", self.dim(), "D"); - AT_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); - AT_CHECK(p >= 0, "pdist only supports non-negative p values"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes"); + TORCH_CHECK(p >= 0, "pdist only supports non-negative p values"); return at::_pdist_forward(self.contiguous(), p); } Tensor cdist(const Tensor& x1, const Tensor& x2, const double p) { - AT_CHECK(x1.dim() == 2, "cdist only supports 2D tensors, X1 got: ", x1.dim(), "D"); - AT_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type()); + TORCH_CHECK(x1.dim() == 2, "cdist only supports 2D tensors, X1 got: ", x1.dim(), "D"); + TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type()); auto device1 = x1.type().device_type(); - AT_CHECK(device1 == kCPU || device1 == kCUDA, "cdist only supports CPU and CUDA devices, X1 got: ", device1); - AT_CHECK(x2.dim() == 2, "cdist only supports 2D tensors, X2 got: ", x2.dim(), "D"); - AT_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type()); + TORCH_CHECK(device1 == kCPU || device1 == 
kCUDA, "cdist only supports CPU and CUDA devices, X1 got: ", device1); + TORCH_CHECK(x2.dim() == 2, "cdist only supports 2D tensors, X2 got: ", x2.dim(), "D"); + TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type()); auto device2 = x2.type().device_type(); - AT_CHECK(device2 == kCPU || device2 == kCUDA, "cdist only supports CPU and CUDA devices, X2 got: ", device2); - AT_CHECK(p >= 0, "cdist only supports non-negative p values"); - AT_CHECK(device1 == device2, "X1 and X2 must have the same device type. X1: ", device1, " X2: ", device2); - AT_CHECK(!x1.is_cuda() || x1.get_device() == x2.get_device(), "device of X1 (", x1.get_device(), ") must match device of X2 (", x2.get_device(), ")"); + TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "cdist only supports CPU and CUDA devices, X2 got: ", device2); + TORCH_CHECK(p >= 0, "cdist only supports non-negative p values"); + TORCH_CHECK(device1 == device2, "X1 and X2 must have the same device type. X1: ", device1, " X2: ", device2); + TORCH_CHECK(!x1.is_cuda() || x1.get_device() == x2.get_device(), "device of X1 (", x1.get_device(), ") must match device of X2 (", x2.get_device(), ")"); int64_t c1 = x1.size(-1); int64_t c2 = x2.size(-1); - AT_CHECK(c1 == c2, "X1 and X2 must have the same number of columns. X1: ", c1, " X2: ", c2); + TORCH_CHECK(c1 == c2, "X1 and X2 must have the same number of columns. X1: ", c1, " X2: ", c2); int64_t r1 = x1.size(-2); int64_t r2 = x2.size(-2); @@ -54,24 +54,24 @@ Tensor cdist(const Tensor& x1, const Tensor& x2, const double p) { } Tensor _cdist_backward(const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& cdist) { - AT_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous"); - AT_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous"); - AT_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous"); + TORCH_CHECK(x1.is_contiguous(), "_cdist_backward requires X1 to be contiguous"); + TORCH_CHECK(x2.is_contiguous(), "_cdist_backward requires X2 to be contiguous"); + TORCH_CHECK(cdist.is_contiguous(), "_cdist_backward requires dist to be contiguous"); int64_t n = x1.size(-2); int64_t m = x1.size(-1); auto device1 = x1.type().device_type(); - AT_CHECK(device1 == kCPU || device1 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X1 got: ", device1); + TORCH_CHECK(device1 == kCPU || device1 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X1 got: ", device1); auto device2 = x2.type().device_type(); - AT_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2); + TORCH_CHECK(device2 == kCPU || device2 == kCUDA, "_cdist_backward only supports CPU and CUDA devices, X2 got: ", device2); Tensor grad_x1 = at::empty({n, m}, x1.options()); cdist_backward_stub(device1, grad_x1, grad, x1, x2, p, cdist); return grad_x1; } Tensor _pdist_forward(const Tensor& self, const double p) { - AT_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); + TORCH_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); auto device = self.type().device_type(); - AT_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); + TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); Tensor result = at::empty({0}, self.options()); if (self.size(0) <= 1) { 
result.resize_({0}); @@ -89,10 +89,10 @@ Tensor _pdist_forward(const Tensor& self, const double p) { } Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, const Tensor& pdist) { - AT_CHECK(self.is_contiguous(), "_pdist_backward requires self to be contiguous"); - AT_CHECK(pdist.is_contiguous(), "_pdist_backward requires pdist to be contiguous"); + TORCH_CHECK(self.is_contiguous(), "_pdist_backward requires self to be contiguous"); + TORCH_CHECK(pdist.is_contiguous(), "_pdist_backward requires pdist to be contiguous"); auto device = self.type().device_type(); - AT_CHECK(device == kCPU || device == kCUDA, "_pdist_backward only supports CPU and CUDA devices, got: ", device); + TORCH_CHECK(device == kCPU || device == kCUDA, "_pdist_backward only supports CPU and CUDA devices, got: ", device); Tensor result = at::empty_like(self); pdist_backward_stub(device, result, grad, self, p, pdist); return result; diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 9d7c3aa2f3c8..e99f6be6c994 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -153,7 +153,7 @@ Tensor& bernoulli_tensor_cpu_(Tensor& self, const Tensor& p_, Generator* gen) { DEFINE_DISPATCH(bernoulli_mkl_stub); Tensor& bernoulli_scalar_cpu_(Tensor& self, double p, Generator* gen) { - AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); #if AT_MKL_ENABLED() if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { bernoulli_mkl_stub(kCPU, self, p, gen); diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 52181f1f1eac..32b27d438134 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -10,7 +10,7 @@ using Ctype = typename std::conditional::type; Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sizes(); - AT_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); + TORCH_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); std::vector sizes; sizes.reserve(input.dim()); sizes.push_back(input_sizes[0]); @@ -21,7 +21,7 @@ Tensor make_feature_noise(const Tensor& input) { } bool is_fused_kernel_acceptable(const Tensor& input, double p) { - return input.is_cuda() && p > 0 && p < 1; + return input.is_cuda() && p > 0 && p < 1 && input.numel() > 0; } // NB: sure, we could have used different overloads here, but I would feel insecure @@ -40,8 +40,8 @@ Tensor multiply(const Tensor& input, const Tensor& noise) { template Ctype _dropout_impl(T& input, double p, bool train) { - AT_CHECK(p >= 0 && p <= 1, "dropout probability has to be between 0 and 1, but got ", p); - if (p == 0 || !train) { + TORCH_CHECK(p >= 0 && p <= 1, "dropout probability has to be between 0 and 1, but got ", p); + if (p == 0 || !train || input.numel() == 0) { return input; } diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 485a2b0d39b5..66d4fb1c48b3 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -339,7 +339,7 @@ _embedding_bag_cpu(const Tensor &weight, const Tensor &indices, checkScalarTypes("embedding_bag", weight_arg, {kFloat, kDouble}); if (per_sample_weights.defined()) { - AT_CHECK(mode == MODE_SUM, + TORCH_CHECK(mode == MODE_SUM, "embedding_bag: 
per_sample_weights only supported with mode='sum'"); auto per_input_weights_arg = TensorArg( per_sample_weights,"per_sample_weights", 1); @@ -624,7 +624,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( const Tensor& offsets, const Tensor& offset2bag, int64_t mode) { - AT_CHECK( + TORCH_CHECK( mode == MODE_SUM, "embedding_bag_backward: per_sample_weights only supported for mode='sum'"); diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index 879a798a40b0..8e7dc4e6343a 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -144,7 +144,7 @@ void fractional_max_pool2d_out_cpu_template( auto input = input_.contiguous(); int ndims = input.ndimension(); - AT_CHECK(input.numel() > 0 && (ndims == 3 || ndims == 4), + TORCH_CHECK(input.numel() > 0 && (ndims == 3 || ndims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input, but got: ", ndims); @@ -160,10 +160,10 @@ void fractional_max_pool2d_out_cpu_template( int inputH = input.size(heightDim); int inputW = input.size(widthDim); - AT_CHECK(outputH + poolSizeH - 1 <= inputH, + TORCH_CHECK(outputH + poolSizeH - 1 <= inputH, "fractional_max_pool2d(): pool height ", poolSizeH, " too large relative to input height ", inputH); - AT_CHECK(outputW + poolSizeW - 1 <= inputW, + TORCH_CHECK(outputW + poolSizeW - 1 <= inputW, "fractional_max_pool2d(): pool width ", poolSizeW, " too large relative to input width ", inputW); @@ -284,9 +284,9 @@ Tensor& fractional_max_pool2d_backward_out_cpu_template( /* get contiguous gradOutput */ auto gradOutput = gradOutput_.contiguous(); - AT_CHECK(outputW == gradOutput.size(widthDim), + TORCH_CHECK(outputW == gradOutput.size(widthDim), "fractional_max_pool2d_backward(): gradOutput width unexpected"); - AT_CHECK(outputH == gradOutput.size(heightDim), + TORCH_CHECK(outputH == gradOutput.size(heightDim), "fractional_max_pool2d_backward(): gradOutput height unexpected"); /* resize */ diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index f807cc610d02..9ed35b4add68 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -161,7 +161,7 @@ void fractional_max_pool3d_out_cpu_template( int64_t widthDim = 3; int64_t ndims = input_.ndimension(); - AT_CHECK(input_.numel() != 0 && (ndims == 4 || ndims == 5), + TORCH_CHECK(input_.numel() != 0 && (ndims == 4 || ndims == 5), "fractional_max_pool3d_out(): non-empty 4D or 5D (batch mode) tensor ", " expected for input, but got: ", ndims); @@ -179,13 +179,13 @@ void fractional_max_pool3d_out_cpu_template( int64_t inputH = input_.size(heightDim); int64_t inputW = input_.size(widthDim); - AT_CHECK(outputT + poolSizeT - 1 < inputT, + TORCH_CHECK(outputT + poolSizeT - 1 < inputT, "fractional_max_pool3d_out(): pool time ", poolSizeT, " too large relative to input time ", inputT); - AT_CHECK(outputW + poolSizeW - 1 < inputW, + TORCH_CHECK(outputW + poolSizeW - 1 < inputW, "fractional_max_pool3d_out(): pool width ", poolSizeW, " too large relative to input width ", inputW); - AT_CHECK(outputH + poolSizeH - 1 < inputH, + TORCH_CHECK(outputH + poolSizeH - 1 < inputH, "fractional_max_pool3d_out(): pool height ", poolSizeH, " too large relative to input height ", inputH); @@ -317,12 +317,12 @@ void fractional_max_pool3d_backward_out_cpu_template( int64_t inputH = input.size(heightDim); int64_t inputW = input.size(widthDim); - AT_CHECK(outputT == 
gradOutput_.size(timeDim), + TORCH_CHECK(outputT == gradOutput_.size(timeDim), "fractional_max_pool3d_backward_out(): gradOutput time unexpected"); - AT_CHECK(outputH == gradOutput_.size(heightDim), + TORCH_CHECK(outputH == gradOutput_.size(heightDim), "fractional_max_pool3d_backward_out(): ", "gradOutput height unexpected"); - AT_CHECK(outputW == gradOutput_.size(widthDim), + TORCH_CHECK(outputW == gradOutput_.size(widthDim), "fractional_max_pool3d_backward_out(): gradOutput width unexpected"); /* get contiguous gradOutput */ diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 6b54a5f779f0..71f17d99b269 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -559,39 +559,39 @@ grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, con Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - AT_CHECK( + TORCH_CHECK( input.defined() && grid.defined(), "grid_sampler(): expected input and grid to not be undefined, but input " "is ", input, " and grid is ", grid); auto input_opt = input.options(); auto grid_opt = grid.options(); - AT_CHECK( + TORCH_CHECK( input_opt.device() == grid_opt.device(), "grid_sampler(): expected input and grid to be on same device, but input " "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); - AT_CHECK( + TORCH_CHECK( input_opt.dtype() == grid_opt.dtype(), "grid_sampler(): expected input and grid to have same dtype, but input " "has ", input_opt.dtype(), " and grid has ", grid_opt.dtype()); - AT_CHECK( + TORCH_CHECK( input_opt.layout() == kStrided && grid_opt.layout() == kStrided, "grid_sampler(): expected input and grid to have torch.strided layout, but " "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); - AT_CHECK( + TORCH_CHECK( (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), "grid_sampler(): expected 4D or 5D input and grid with same number of " "dimensions, but got input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - AT_CHECK( + TORCH_CHECK( input.size(0) == grid.size(0), "grid_sampler(): expected grid and input to have same batch size, but got " "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - AT_CHECK( + TORCH_CHECK( grid.size(-1) == input.dim() - 2, "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " "dimension, but got grid with sizes ", grid.sizes()); for (int64_t i = 2; i < input.dim(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "grid_sampler(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index 839d69bdd73a..cb379df7a31a 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -32,7 +32,7 @@ namespace native{ Tensor cartesian_prod(TensorList tensors) { for(const Tensor &t : tensors) { - AT_CHECK(t.dim() == 1, "Expect a 1D vector, but got shape ", t.sizes()); + TORCH_CHECK(t.dim() == 1, "Expect a 1D vector, but got shape ", t.sizes()); } if (tensors.size() == 1) { return tensors[0]; @@ -45,8 +45,8 @@ Tensor cartesian_prod(TensorList tensors) { } Tensor combinations(const Tensor& self, int64_t r, bool with_replacement) { - AT_CHECK(self.dim() == 1, "Expect a 1D vector, but got shape ", self.sizes()); - AT_CHECK(r > 0, "Expect 
a positive number, but got ", r); + TORCH_CHECK(self.dim() == 1, "Expect a 1D vector, but got shape ", self.sizes()); + TORCH_CHECK(r > 0, "Expect a positive number, but got ", r); int64_t num_elements = self.numel(); std::vector grids = at::meshgrid(std::vector(r, self)); Tensor mask = _triu_mask(num_elements, r, with_replacement, self.options()); diff --git a/aten/src/ATen/native/LegacyDefinitions.cpp b/aten/src/ATen/native/LegacyDefinitions.cpp index c62de35f2937..73e64d72dec1 100644 --- a/aten/src/ATen/native/LegacyDefinitions.cpp +++ b/aten/src/ATen/native/LegacyDefinitions.cpp @@ -249,7 +249,7 @@ Tensor & random_(Tensor& self, Generator * generator) { return at::legacy::th::_th_random_(self, generator); } -Tensor & uniform_(Tensor& self, double from, double to, Generator * generator) { +Tensor & uniform_cpu_(Tensor& self, double from, double to, Generator * generator) { return at::legacy::th::_th_uniform_(self, from, to, generator); } @@ -405,10 +405,10 @@ Tensor & masked_select_out(Tensor & result, const Tensor & self, const Tensor & Tensor masked_select(const Tensor & self, const Tensor & mask) { if (mask.dtype() == at::ScalarType::Byte) { - return at::legacy::th::_th_masked_select(self, mask); -} else { - return at::legacy::th::_th_masked_select_bool(self, mask); -} + return at::legacy::th::_th_masked_select(self, mask); + } else { + return at::legacy::th::_th_masked_select_bool(self, mask); + } } Tensor & nonzero_out(Tensor & result, const Tensor & self) { diff --git a/aten/src/ATen/native/LegacyNNDefinitions.cpp b/aten/src/ATen/native/LegacyNNDefinitions.cpp index dcf819e51a39..ecde3af9dcab 100644 --- a/aten/src/ATen/native/LegacyNNDefinitions.cpp +++ b/aten/src/ATen/native/LegacyNNDefinitions.cpp @@ -332,22 +332,6 @@ Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, Scal return at::legacy::th::_thnn_softshrink_backward(grad_output, self, lambd); } -Tensor & adaptive_avg_pool3d_out(Tensor & output, const Tensor & self, IntArrayRef output_size) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_forward_out(output, self, output_size); -} - -Tensor adaptive_avg_pool3d(const Tensor & self, IntArrayRef output_size) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_forward(self, output_size); -} - -Tensor & adaptive_avg_pool3d_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_backward_out(grad_input, grad_output, self); -} - -Tensor adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self) { - return at::legacy::th::_thnn_adaptive_avg_pool3d_backward(grad_output, self); -} - Tensor & avg_pool2d_out(Tensor & output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad) { return at::legacy::th::_thnn_avg_pool2d_forward_out(output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); } @@ -380,22 +364,6 @@ Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntA return at::legacy::th::_thnn_avg_pool3d_backward(grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad); } -std::tuple max_pool2d_with_indices_out(Tensor & output, Tensor & indices, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { - return at::legacy::th::_thnn_max_pool2d_with_indices_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); -} - -std::tuple 
max_pool2d_with_indices(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { - return at::legacy::th::_thnn_max_pool2d_with_indices_forward(self, kernel_size, stride, padding, dilation, ceil_mode); -} - -Tensor & max_pool2d_with_indices_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices) { - return at::legacy::th::_thnn_max_pool2d_with_indices_backward_out(grad_input, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); -} - -Tensor max_pool2d_with_indices_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices) { - return at::legacy::th::_thnn_max_pool2d_with_indices_backward(grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices); -} - std::tuple max_pool3d_with_indices_out(Tensor & output, Tensor & indices, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { return at::legacy::th::_thnn_max_pool3d_with_indices_forward_out(output, indices, self, kernel_size, stride, padding, dilation, ceil_mode); } diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index 37dfa0522d8f..5cd4c67376ee 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -38,7 +38,7 @@ namespace native { Tensor& lerp_cpu_tensor_out(Tensor& result, const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out_cpu"); result.resize_as_(b_self); @@ -62,10 +62,10 @@ Tensor& lerp_cpu_scalar_out(Tensor& result, const Tensor& self, Tensor& lerp_cpu_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp__cpu"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cpu", [&]{ lerp_cpu(self, b_self, b_end, b_weight); @@ -76,7 +76,7 @@ Tensor& lerp_cpu_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) Tensor& lerp_cpu_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor b_self, b_end; std::tie(b_self, b_end) = expand_outplace(self, end, "lerp__cpu"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cpu", [&]{ @@ -87,7 +87,7 @@ Tensor& lerp_cpu_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor lerp_cpu_tensor(const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, 
b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_cpu"); Tensor result = at::empty_like(b_self); diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 0970a9217eea..b03e1032d945 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -33,7 +33,7 @@ Tensor linear(const Tensor& input, const Tensor& weight, const Tensor& bias) { static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArrayRef sum_dims_, bool keepdim) { // assumes that tensors have been pre-unsqueezed (so that all dimensions match - after broadcasting) // but makes no other assumptions on the order of dimensions - AT_CHECK(left_.dim()==right_.dim(), "number of dimensions must match"); + TORCH_CHECK(left_.dim()==right_.dim(), "number of dimensions must match"); if (sum_dims_.size() == 0) return at::mul(left_, right_); int64_t dim = left_.dim(); @@ -50,7 +50,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra auto sr = right.size(i)>1; if (sum_dims[i]) { // first dimensions that will be summed over after multiplication if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size - AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + TORCH_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); sum_size *= left.size(i); } else if (sl) { // if it is only in one of left and right, we can sum right away left = left.sum(i, true); @@ -59,7 +59,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra } } else if (sl && sr) { // now deal with dimensions dimensions that will be in the output // dimensions nontrivially in both left and right must be of the same size - AT_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + TORCH_CHECK(left.size(i)==right.size(i), "non-broadcast dimensions must match"); lro.push_back(i); lro_size *= left.size(i); } else if (sl) { // keep track of dimensions appearing only once @@ -168,7 +168,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { int64_t num_total_idxes = 0; while (! eqn_stream.eof()) { std::getline(eqn_stream, term, ','); // term = string with indices of current term - AT_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension + TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions @@ -178,7 +178,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { for (auto &c : term) { // c = character with a single letter or '.' if (c == '.') { ell_char_count++; - AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' 
in term ", operand, " of the equation"); if (ell_char_count == 3) { // this completes the ellipsis if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size first_ell_idx = num_total_idxes; @@ -186,7 +186,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_total_idxes += num_ell_idxes; } else { // we have seen an ellipsis before, so we check compatibility - AT_CHECK(candidate_num_ell_idxes == num_ell_idxes, + TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); } for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices @@ -196,8 +196,8 @@ Tensor einsum(std::string eqn, TensorList tensors) { dims_in_term += num_ell_idxes; // keep track of dimensions } } else { // a letter (hopefully) - AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); - AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; // letter_num = position in letter_mapping if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping letter_mapping[letter_num] = num_total_idxes; @@ -211,12 +211,12 @@ Tensor einsum(std::string eqn, TensorList tensors) { dims_in_term++; } } - AT_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); + TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); input_op_idxes.push_back(std::move(current_op_idxes)); operand++; } // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. - AT_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); + TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); // the following parses or infers output (right hand side) // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) @@ -228,19 +228,19 @@ Tensor einsum(std::string eqn, TensorList tensors) { for (auto &c : eqn.substr(pos+2)) { if (c == '.') { // '.' as part of ellipsis ell_char_count++; - AT_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); if (ell_char_count == 3) { // ellipsis complete - AT_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); + TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); for (int64_t i = 0; i < num_ell_idxes; ++i) { idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; num_output_dims++; } } } else if (! isspace(c)) { // letter (hopefully) - AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); - AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis in the right hand side"); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; - AT_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, "occurs twice in output"); + TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, "occurs twice in output"); idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; num_output_dims++; } @@ -293,11 +293,11 @@ Tensor einsum(std::string eqn, TensorList tensors) { size_of_dims[idx] = preprocessed_op.size(dim); } else { - AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); } dim++; } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] - AT_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); // diagonal moves the diagonal dimension to the back // now we permute the last dim back to idx_to_dim[dim_out] @@ -367,7 +367,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, IntArrayRef expand1_, IntArrayRef expand2_, IntArrayRef expand3_, IntArrayRef sumdim_, int64_t unroll_dim) { int64_t total_dim = i1_.dim()+expand1_.size(); - AT_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]"); + TORCH_CHECK((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,", total_dim-1, "]"); auto expand1 = at::dim_list_to_bitset(expand1_, total_dim); auto expand2 = at::dim_list_to_bitset(expand2_, total_dim); auto expand3 = at::dim_list_to_bitset(expand3_, total_dim); @@ -433,18 +433,18 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, } Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const Tensor& bias) { - AT_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); + TORCH_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); for (int64_t i = 0; i < input1.dim() - 1; i++) { - AT_CHECK(input1.size(i) == input2.size(i), + TORCH_CHECK(input1.size(i) == input2.size(i), "bilinear(): input batch dimensions do not match at dim ", i, ": got ", input1.size(i), " and ", input2.size(i)); } - AT_CHECK(input1.size(input1.dim() - 1) == weight.size(1), + TORCH_CHECK(input1.size(input1.dim() - 1) == weight.size(1), "bilinear(): input1 size does not match weight size: got ", input1.size(input1.dim() - 1), " but expected ", weight.size(1)); - AT_CHECK(input2.size(input2.dim() - 1) == weight.size(2), + TORCH_CHECK(input2.size(input2.dim() - 1) == weight.size(2), "bilinear(): input2 size does not match weight size: got ", input2.size(input2.dim() - 1), " but expected ", weight.size(2)); - AT_CHECK(!bias.defined() || bias.size(0) == weight.size(0), + TORCH_CHECK(!bias.defined() || bias.size(0) == weight.size(0), "bilinear(): bias size does not match 
weight size: got ", bias.size(0), " but expected ", weight.size(0)); @@ -464,7 +464,7 @@ Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight // implements tensordot, a matrix-multiplication-like contraction, but the dimensions given // in the two dimension lists Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, IntArrayRef dims2) { - AT_CHECK(dims1.size() == dims2.size(), "both dimension lists should have same length"); + TORCH_CHECK(dims1.size() == dims2.size(), "both dimension lists should have same length"); int64_t csize = 1; // total size of the contracted dimensions Tensor t1 = input1; Tensor t2 = input2; @@ -476,7 +476,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, } else if (s1 == 1) { t2 = t2.sum(dims2[i], true); } else { - AT_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i], + TORCH_CHECK(s1 == s2, "contracted dimensions need to match, but first has size ", s1, " in dim ", dims1[i], " and second has size ", s2, " in dim ", dims2[i]); csize *= s1; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index b40120c3a5e2..c01f37beb480 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -23,7 +23,7 @@ static inline std::tuple _lu_det_P_diag_U_info(const Tensor Tensor p, lu, info; std::tie(lu, p, info) = at::_lu_with_info(self, /*pivot=*/true, /*check_errors=*/false); int int_info = info.item(); - AT_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); + TORCH_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); auto n = self.size(0); auto num_exchanges = (at::arange(1, n + 1, p.options()) != p).nonzero().size(0); if (num_exchanges % 2 == 1) { @@ -34,7 +34,7 @@ static inline std::tuple _lu_det_P_diag_U_info(const Tensor } Tensor det(const Tensor& self) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2 && self.size(0) == self.size(1), "det(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " "of floating types"); @@ -50,7 +50,7 @@ Tensor det(const Tensor& self) { } Tensor logdet(const Tensor& self) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2 && self.size(0) == self.size(1), "logdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " "of floating types"); @@ -73,7 +73,7 @@ Tensor logdet(const Tensor& self) { } std::tuple slogdet(const Tensor& self) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2 && self.size(0) == self.size(1), "slogdet(", self.type(), "{", self.sizes(), "}): expected a 2D square tensor " "of floating types"); @@ -93,7 +93,7 @@ std::tuple slogdet(const Tensor& self) { } Tensor pinverse(const Tensor& self, double rcond) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, "pinverse(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); if (self.numel() == 0) { @@ -121,7 +121,7 @@ static inline Tensor _matrix_rank_helper(const Tensor& self, bool symmetric) { } Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, 
+ TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, "matrix_rank(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); @@ -130,7 +130,7 @@ Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { } Tensor matrix_rank(const Tensor& self, bool symmetric) { - AT_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, + TORCH_CHECK(at::isFloatingType(self.scalar_type()) && self.dim() == 2, "matrix_rank(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); @@ -140,7 +140,7 @@ Tensor matrix_rank(const Tensor& self, bool symmetric) { } static void check_1d(const Tensor& t, const char* arg, const char* fn) { - AT_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D"); + TORCH_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D"); } Tensor ger(const Tensor& self, const Tensor& vec2) { @@ -368,7 +368,7 @@ Tensor dot(const Tensor& self, const Tensor& tensor) { Tensor& dot_out(Tensor& result, const Tensor& self, const Tensor& tensor) { result.resize_({}); - AT_CHECK(result.scalar_type() == self.scalar_type(), + TORCH_CHECK(result.scalar_type() == self.scalar_type(), "result dtype ", result.scalar_type(), " does not match self dtype ", self.scalar_type()); return result.fill_(self.dot(tensor)); } @@ -428,6 +428,29 @@ Tensor matmul( Tensor output = has_out ? at::_unsafe_view(at::mm_out(out, t1, t2), output_size) : at::_unsafe_view(t1.mm(t2), output_size); return has_out ? out.set_(output) : output; + } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { + // optimization: transpose the inner dimensions of the arguments, call + // matmul on the swapped arguments, then transpose the inner dimensions + // of the result. + const int64_t n = dim_tensor1 == 2 ? tensor1.size(-2) : 1; + const int64_t m = tensor1.size(-1); + const int64_t p = tensor2.size(-1); + + const Tensor t2_T = tensor2.transpose(-1, -2); + const Tensor t1_T = dim_tensor1 == 2 ? tensor1.t() : tensor1.reshape({n, m}).t(); + const Tensor res_T = matmul(out_opt, t2_T, t1_T); + + if (dim_tensor1 == 2) { + Tensor res = res_T.transpose(-1, -2).contiguous(); + return has_out ? out.set_(res) : res; + } + else { + std::vector shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec(); + shape.push_back(p); + + Tensor res = res_T.reshape(shape).contiguous(); + return has_out ? 
out.set_(res) : res; + } } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { // We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); // we track m1 vs m2 separately even though they must match for nicer error messages @@ -489,7 +512,7 @@ Tensor& matmul_out(Tensor &result, const Tensor & tensor1, const Tensor & tensor } Tensor matrix_power(const Tensor& a, int64_t n) { - AT_CHECK(a.dim() >= 2 && at::isFloatingType(a.scalar_type()), + TORCH_CHECK(a.dim() >= 2 && at::isFloatingType(a.scalar_type()), "matrix_power(", a.type(), "{", a.sizes(), "}): expected a tensor " "of floating types with dim at least 2"); if (n == 0) { @@ -531,7 +554,7 @@ Tensor frobenius_norm(const Tensor& self) { } Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { - AT_CHECK( + TORCH_CHECK( dim.size() <= 2, "Expected at most 2 dimensions, but got ", dim.size(), @@ -547,7 +570,7 @@ Tensor &frobenius_norm_out( const Tensor& self, IntArrayRef dim, bool keepdim) { - AT_CHECK( + TORCH_CHECK( dim.size() <= 2, "Expected at most 2 dimensions, but got ", dim.size(), @@ -559,7 +582,7 @@ Tensor &frobenius_norm_out( } Tensor nuclear_norm(const Tensor& self, bool keepdim) { - AT_CHECK( + TORCH_CHECK( self.dim() == 2, "Expected a tensor with 2 dimensions, but got a ", self.dim(), @@ -568,7 +591,7 @@ Tensor nuclear_norm(const Tensor& self, bool keepdim) { } Tensor &nuclear_norm_out(Tensor& result, const Tensor& self, bool keepdim) { - AT_CHECK( + TORCH_CHECK( self.dim() == 2, "Expected a tensor with 2 dimensions, but got a ", self.dim(), diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 3e470f51da06..9fee50876462 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -99,11 +99,11 @@ static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A) { AT_ERROR(ss.str()); } - AT_CHECK(A.size(-1) == A.size(-2), + TORCH_CHECK(A.size(-1) == A.size(-2), "A must be batches of square matrices, " "but they are ", A.size(-1), " by ", A.size(-2), " matrices"); - AT_CHECK(A.size(-1) == self.size(-2), + TORCH_CHECK(A.size(-1) == self.size(-2), "Incompatible matrix sizes for matmul: each A " "matrix is ", A.size(-1), " by ", A.size(-1), " but each b matrix is ", self.size(-2), " by ", self.size(-1)); @@ -111,7 +111,7 @@ static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A) { // Validates input shapes for operations on batches of square matrices (inverse, cholesky, lu) static inline void squareCheckInputs(const Tensor& self) { - AT_CHECK(self.size(-1) == self.size(-2), + TORCH_CHECK(self.size(-1) == self.size(-2), "A must be batches of square matrices, " "but they are ", self.size(-1), " by ", self.size(-2), " matrices"); } @@ -164,7 +164,7 @@ static inline void singleCheckErrors(int64_t info, const char* name) { // Checks if all the Tensors in a TensorList are of the same dimensions static inline void checkAllSameDim(TensorList tensors, int64_t dim) { for (auto &t : tensors) { - AT_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); + TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); } } diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 7def0da9ae0f..ad18ee8578e4 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -1,9 +1,15 @@ +// define constants like M_PI and C keywords for MSVC 
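The new matmul branch above, which handles a 1-D or 2-D first operand against a batched (3-D or higher) second operand, relies on the identity (A*B)^T = B^T * A^T: it multiplies the transposed operands in swapped order and transposes the result back, so the batched tensor can stay as the leading argument. A minimal standalone check of that identity on plain 2x2 arrays (ordinary C++, no ATen) looks like this; the small integer entries keep the double comparison exact:

// transpose_identity.cpp: verifies A*B == transpose(transpose(B)*transpose(A)).
#include <array>
#include <cassert>

using Mat2 = std::array<std::array<double, 2>, 2>;

// Naive 2x2 matrix product.
Mat2 mul(const Mat2& a, const Mat2& b) {
  Mat2 c{};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k)
        c[i][j] += a[i][k] * b[k][j];
  return c;
}

Mat2 transpose(const Mat2& a) {
  return {{{a[0][0], a[1][0]}, {a[0][1], a[1][1]}}};
}

int main() {
  Mat2 a = {{{1, 2}, {3, 4}}};
  Mat2 b = {{{5, 6}, {7, 8}}};
  Mat2 lhs = mul(a, b);                                   // A*B
  Mat2 rhs = transpose(mul(transpose(b), transpose(a)));  // (B^T * A^T)^T
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      assert(lhs[i][j] == rhs[i][j]);
  return 0;
}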
+#ifdef _MSC_VER +#define _USE_MATH_DEFINES +#include +#endif #include #include #include #include #define EPSILON 1e-12 +#define _USE_MATH_DEFINES namespace { static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { @@ -125,4 +131,21 @@ Tensor binary_cross_entropy_with_logits_backward(const Tensor& grad, const Tenso return grad_input; } + +Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction) +{ + Tensor loss; + if (log_input) { + loss = at::exp(input) - target * input; + } else { + loss = input - target * at::log(input + eps); + } + + if (full) { + auto mask1 = (target > 1); + loss.masked_select(mask1) += (target * at::log(target) - target + 0.5 * at::log(2 * M_PI * target)).masked_select(mask1); + } + + return apply_loss_reduction(loss, reduction); +} }} // namespace at::native diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index b17f9c9f7d8d..756f36f4abc4 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -49,9 +49,9 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); - AT_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); - AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + TORCH_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); + TORCH_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + TORCH_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); size_t tg_target_stride; int64_t max_target_length = 0; @@ -77,13 +77,13 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const } tg_target_stride = targets.stride(1); checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, + TORCH_CHECK(targets.size(1) >= max_target_length, "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, " (while checking arguments for ", c, ")"); } int64_t max_input_length = log_probs.size(0); for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, + TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", input_lengths[b], " for ", log_probs_arg, " (while checking arguments for ", c, ")"); } @@ -377,8 +377,8 @@ Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntArrayRef inpu // Convenience function accepting Tensors Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, const Tensor& input_lengths, const Tensor& target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) { - AT_CHECK(isIntegralType(input_lengths.scalar_type()), "input_lenghts must be integral"); - AT_CHECK(isIntegralType(target_lengths.scalar_type()), "target_lenghts must be integral"); + TORCH_CHECK(isIntegralType(input_lengths.scalar_type()), "input_lenghts must be integral"); + TORCH_CHECK(isIntegralType(target_lengths.scalar_type()), "target_lenghts must be integral"); Tensor ilc = input_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); Tensor tlc = 
target_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index ca0215cc4a8b..0f257e6eb247 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -15,7 +15,7 @@ namespace at { namespace native { namespace { void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ - AT_CHECK(actual == expected, + TORCH_CHECK(actual == expected, arg_name, " should contain ", expected, " elements not ", actual); } @@ -434,7 +434,7 @@ Tensor instance_norm( const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { - AT_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), + TORCH_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), "Expected running_mean and running_var to be defined when use_input_stats is false"); std::vector shape = input.sizes().vec(); int64_t b = input.size(0); @@ -462,64 +462,6 @@ Tensor instance_norm( return out.view(input.sizes()); } -Tensor layer_norm(const Tensor& input, IntArrayRef normalized_shape, - const Tensor& weight /* optional */, const Tensor& bias /* optional */, - double eps, bool cudnn_enabled) { - - int64_t normalized_ndim = normalized_shape.size(); - - AT_CHECK(normalized_ndim >= 1, - "Expected normalized_shape to be at least 1-dimensional, i.e., ", - "containing at least one element, but got normalized_shape=", - normalized_shape); - - AT_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape), - "Expected weight to be of same shape as normalized_shape, but got ", - "weight of shape ", weight.sizes(), " and normalized_shape=", - normalized_shape); - AT_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape), - "Expected bias to be of same shape as normalized_shape, but got ", - "bias of shape ", bias.sizes(), " and normalized_shape=", - normalized_shape); - - auto input_shape = input.sizes(); - auto input_ndim = input.dim(); - - if (input_ndim < normalized_ndim || - !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) { - std::stringstream ss; - ss << "Given normalized_shape=" << normalized_shape - << ", expected input with shape [*"; - for (auto size : normalized_shape) { - ss << ", " << size; - } - ss << "], but got input of size" << input_shape; - AT_ERROR(ss.str()); - } - - int64_t n = 1; - for (int64_t i = 0; i < input_ndim - normalized_ndim; i++) { - n *= input_shape[i]; - } - - // Apply layer norm - auto input_reshaped = input.contiguous().view({1, n, -1}); - - auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, - cudnn_enabled); - out = out.view(input_shape); - - if (weight.defined() && bias.defined()) { - return bias.addcmul(out, weight, 1); - } else if (weight.defined()) { - return out.mul(weight); - } else if (bias.defined()) { - return out.add(bias); - } else { - return out; - } -} - Tensor group_norm(const Tensor& input, int64_t num_groups, const Tensor& weight /* optional */, const Tensor& bias /* optional */, double eps, bool cudnn_enabled) { @@ -528,16 +470,16 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, int64_t b = input.size(0); int64_t c = input.size(1); - AT_CHECK(c % num_groups == 0, + TORCH_CHECK(c % num_groups == 0, "Expected number of channels in input 
to be divisible by ", "num_groups, but got input of shape ", input.sizes(), " and " "num_groups=", num_groups); - AT_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), + TORCH_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), "Expected weight to be a vector of size equal to the number of ", "channels in input, but got weight of shape ", weight.sizes(), " and input of shape ", input.sizes()); - AT_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), + TORCH_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), "Expected bias to be a vector of size equal to the number of ", "channels in input, but got bias of shape ", weight.sizes(), " and input of shape ", input.sizes()); diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index d02f97ee3e59..2962e04676e9 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -3,7 +3,7 @@ namespace at { namespace native { Tensor one_hot(const Tensor &self, int64_t num_classes) { - AT_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); + TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); auto shape = self.sizes().vec(); // empty tensor could be converted to one hot representation, @@ -18,11 +18,11 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } // non-empty tensor - AT_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); + TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } else { - AT_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); + TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); } shape.push_back(num_classes); diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index bd9a70c781ff..39dd0b518863 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -4,7 +4,7 @@ namespace at { namespace native { void checkLongTensor(const Tensor& tensor) { - AT_CHECK(tensor.dim() == 1 && tensor.type().device_type() == at::kCPU && tensor.scalar_type() == at::kLong, + TORCH_CHECK(tensor.dim() == 1 && tensor.type().device_type() == at::kCPU && tensor.scalar_type() == at::kLong, "'lengths' argument should be a 1D CPU int64 tensor"); } @@ -20,10 +20,10 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten int64_t batch_size = input.size(1); int64_t * lengths = lengths_t.data(); - AT_CHECK(lengths_t.size(0) == batch_size, + TORCH_CHECK(lengths_t.size(0) == batch_size, "Expected `len(lengths)` to be equal to batch_size, but got ", lengths_t.size(0), " (batch_size=", batch_size, ")"); - AT_CHECK(lengths[batch_size - 1] > 0, + TORCH_CHECK(lengths[batch_size - 1] > 0, "Length of all samples has to be greater than 0, but found an element " "in 'lengths' that is <= 0"); for(auto i = 0; i < batch_size - 1; i++) { @@ -83,7 +83,7 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten } prev_l = l; } - AT_CHECK(l >= prev_l); + TORCH_CHECK(l >= prev_l); } return std::make_tuple(at::cat(steps), batch_sizes_t); @@ -95,7 +95,7 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_size, const Tensor& _batch_sizes, bool batch_first) { std::vector input_size_after_t 
= input_size.vec(); if (batch_first) { - AT_CHECK(input_size.size() >= 2); + TORCH_CHECK(input_size.size() >= 2); std::swap(input_size_after_t[0], input_size_after_t[1]); } auto grad_input = at::zeros(input_size_after_t, grad.options()); @@ -126,7 +126,7 @@ std::tuple _pad_packed_sequence(const Tensor& data, const Tensor int64_t max_real_seq_length = batch_sizes_t.size(0); int64_t max_seq_length = max_real_seq_length; if (total_length > 0) { - AT_CHECK(total_length >= max_seq_length, + TORCH_CHECK(total_length >= max_seq_length, "Expected total_length to be at least the length of the longest " "sequence in input, but got total_length=", total_length, " and " "max sequence length being ", max_seq_length); diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 9e267ffb76ed..a04c2878845e 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -12,7 +12,7 @@ static void check1d( const char* function_name, const char* argument_name, IntArrayRef x) { - AT_CHECK( + TORCH_CHECK( x.size() == 1, function_name, "() argument '", argument_name, "' should contain one int (got ", x.size(), ")"); diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 3e5b7369fc05..449a99530b1c 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -49,13 +49,13 @@ signature. and may be omitted by passing an undefined tensor. When a function takes multiple `Tensor` arguments, these tensors are assumed to be the same type (e.g., if one argument is a `FloatTensor`, all other arguments are checked - to be `FloatTensor`s.) + to be `FloatTensor`s). `Tensor` or `Tensor?` must sometimes be annotated to indicate aliasing and mutability. - In general annotations can be defined via the following four situtations - `Tensor(a)` - `a` is a set of Tensors that may alias to the same data. - `Tensor(a!)` - `a` members of a may be written to thus mutating the underlying data. - `Tensor!` - shorthand for Tensor(fresh\_identifier!) - `Tensor(a! -> a|b)` - Tensor is in set `a`, written to, and after the write is in set `a` AND `b`. + In general annotations can be defined via the following four situations: + - `Tensor(a)` - `a` is a set of Tensors that may alias to the same data. + - `Tensor(a!)` - `a` members of a may be written to thus mutating the underlying data. + - `Tensor!` - shorthand for Tensor(fresh\_identifier!) + - `Tensor(a! -> a|b)` - Tensor is in set `a`, written to, and after the write is in set `a` AND `b`. For more details on when and why this needs to happen, please see the section on annotations. - `Tensor[]`. A `Tensor[]` argument translates into a C++ argument of type `ArrayRef` (a.k.a. `TensorList`) @@ -80,18 +80,18 @@ signature. - `*` is a special sentinel argument, which doesn't translate into an actual argument, but indicates that in the Python bindings, any subsequent arguments must be specified as keyword arguments (and cannot be provided positionally). -- `?` is trailing question mark that annotate an argument to be an optional type, grep for +- `?` is trailing question mark that annotates an argument to be an optional type. Grep for `optional` to find some example usages. In general, most functions will not need to use this, but there are some cases that we want to use optional for the different types: - - You want to pass in a `None` to a ATen function/method from Python, and handles the - None type in the C++ side. For example, `clamp(Tensor self, Scalar? min=None, Scalar? 
max=None)` - can take `None` for its `min` and `max` parameter, and do dispatch to different - backend if one of the parameters is `None`. Optional type can accept a `None` type + - You want to pass a `None` to an ATen function/method from Python and handle the + None type on the C++ side. For example, `clamp(Tensor self, Scalar? min=None, Scalar? max=None)` + can take `None` for its `min` and `max` parameter, but does not dispatch to different + backends if one of the parameters is `None`. Optional type can accept a `None` type (`nullopt` in C++) from Python and use the [C++ Optional class](https://en.cppreference.com/w/cpp/utility/optional) to interact with the parameters. - - You want a default value which is fine in Python but would cause ambiguity in C++. + - You want a default value, which is fine in Python, but would cause ambiguity in C++. For example, `norm(Tensor self, Scalar p=2, int dim, bool keepdim=False)` would - cause ambiguity in C++ since it default args must be adjacent and `p` could not - have a default value when `dim` does not. Therefore, we need to make `p` as a + cause ambiguity in C++ since its default args must be adjacent (`p` could not + have a default value when `dim` does not). Therefore, we need to make `p` as a optional Scalar, and make `p=2` when `p` is not passed in (nullopt). - You want a value to default to the same value as another argument (this cannot be expressed in C++ default arguments). @@ -123,7 +123,7 @@ Here are the supported default values: * Numbers (e.g., `0` or `5.0` for `int`, `float` and `int[]` with an explicit length (e.g., `int[2]`)--in the case of `int[]` a number is replicated to fill the length (e.g., `int[2] x=2` - is equivalent to `int[2] x=[2,2]`. + is equivalent to `int[2] x=[2,2]`). * Lists of numbers (e.g., `[0, 0]`) for `IntList`. * Booleans (e.g., `True`) for `bool`. * Empty initializer lists (e.g., `[]`) for `Tensor` (this implicitly changes @@ -191,19 +191,19 @@ more complicated neural network layers (e.g., `conv2d`) and internal functions designed specifically for binding (e.g., `cudnn_convolution`). As we progress along our schema unification of the `func` schema with the JIT -signatue schema, we must introduce features that allow us to increase compliance. +signature schema, we must introduce features that allow us to increase compliance. One of these features are Tensor annotations. As of now we use naming conventions to indicate whether an argument of a function is going to be mutated and returned. ### `annotations` There are two typical situations in which we mutate the memory of an argument in the Python -frontend: -a) For an inplace operations such as `self.abs_()` +frontend: +a) For an inplace operations such as `self.abs_()` b) for a function with an output keyword argument such as `torch.abs(input, out=None)`. In order to provide implementations for these Python functions the legacy schema -requires C++ implementations for three situations `abs(Tensor self) -> Tensor`, +requires C++ implementations for three situations `abs(Tensor self) -> Tensor`, `abs_(Tensor self) -> Tensor` and `abs_out(Tensor out, Tensor self) -> Tensor`. Now, as we move towards the unification, we start to use a different syntax to represent @@ -220,14 +220,14 @@ Let's revisit the previous native function declarations and see the conventions `self` may be written to and returned. Further, the annotation indicates that the return value may alias the input. This indicates an inplace function and by convention ends in a single '\_'. 
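The annotation conventions described above, together with the out= variant covered in the next bullet, can be mimicked in a standalone toy. The sketch below uses a plain std::vector as a stand-in for Tensor (none of it is ATen code, and the helper names are made up) to contrast the functional abs, the in-place abs_ that mutates and returns its argument, and the abs_out variant that writes into a caller-provided output:

// annotation_demo.cpp: toy illustration of the functional, in-place ("_") and
// out= calling conventions that the schema annotations describe. "Vec" is a
// stand-in for Tensor; the three functions are hypothetical, not ATen.
#include <cassert>
#include <cmath>
#include <vector>

using Vec = std::vector<double>;

// abs(Tensor self) -> Tensor: no annotation, returns fresh memory.
Vec abs_functional(const Vec& self) {
  Vec r;
  r.reserve(self.size());
  for (double v : self) r.push_back(std::fabs(v));
  return r;
}

// abs_(Tensor(a!) self) -> Tensor(a!): mutates its input and returns it.
Vec& abs_inplace(Vec& self) {
  for (double& v : self) v = std::fabs(v);
  return self;
}

// abs_out(Tensor(a!) out, Tensor self) -> Tensor(a!): writes into `out`.
Vec& abs_out(Vec& out, const Vec& self) {
  out.clear();
  for (double v : self) out.push_back(std::fabs(v));
  return out;
}

int main() {
  Vec x = {-1.0, 2.0, -3.0};

  Vec y = abs_functional(x);  // x untouched, y is a fresh result
  assert(x[0] == -1.0 && y[0] == 1.0);

  Vec out;
  abs_out(out, x);            // result lands in `out`, x still untouched
  assert(out[2] == 3.0);

  abs_inplace(x);             // x itself is rewritten and returned
  assert(x[0] == 1.0);
  return 0;
}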
- `abs(Tensor self, *, Tensor(a!) out) -> Tensor(a!)` - In the Python frontend `out` can be passed as a keyword argument and may be written to. + In the Python frontend `out` can be passed as a keyword argument and may be written to. In this case it indicates the schema for a function that must accept `out` as this does not provide a default argument. The idea behind representing this as a optional argument is to document the intended usage. This maps to the legacy `abs_out(Tensor out, Tensor self) -> Tensor`. As with the legacy `_out` function you must call the argument `Tensor out` or `Tensor out0`, `Tensor out1` in the context of multiple arguments. -There is also another situtation in which we use annotations, namely views. +There is also another situation in which we use annotations, namely views. - `transpose(Tensor(a) self, int dim0, int dim1) -> Tensor(a)` An alias to the memory represented by `self` may be also returned, however it is not mutated. @@ -298,9 +298,8 @@ implementation (no header necessary) with a matching signature to the generated header from the ATen metadata. There are many simple native functions; take a look at some of them to see what to do. -Although, for the most part, writing an ATen function is mostly writing -the algorithm you want to implement, there are some less obvious details -you should also consider. +Although writing an ATen function is mostly writing the algorithm you want +to implement, there are some less obvious details you should also consider. ### Will your function be automatically differentiable? diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 420e0ea4df04..9b5071941e89 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -114,10 +114,10 @@ struct QuantizedCellParams { const Scalar zero_point_hh; Tensor matmul_ih(Tensor input) const { - AT_CHECK(false, "matmul is not supported with quantized cell params"); + TORCH_CHECK(false, "matmul is not supported with quantized cell params"); } Tensor matmul_hh(Tensor h) const { - AT_CHECK(false, "matmul is not supported with quantized cell params"); + TORCH_CHECK(false, "matmul is not supported with quantized cell params"); } Tensor linear_ih(Tensor input) const { return at::fbgemm_linear_int8_weight( @@ -132,7 +132,7 @@ struct QuantizedCellParams { // Gathers every two elements of a vector in a vector of pairs template static std::vector> pair_vec(const std::vector& vals) { - AT_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); + TORCH_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); std::vector> result; result.reserve(vals.size() / 2); for (int64_t i = 0; i < vals.size(); i += 2) { @@ -158,12 +158,12 @@ static std::vector gather_params(TensorList params, bool has_biases) static at::Tensor undefined; std::vector result; if (has_biases) { - AT_CHECK(params.size() % 4 == 0, "got an incorrect number of RNN parameters"); + TORCH_CHECK(params.size() % 4 == 0, "got an incorrect number of RNN parameters"); for (size_t i = 0; i < params.size(); i += 4) { result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3]); } } else { - AT_CHECK(params.size() % 2 == 0, "got an incorrect number of RNN parameters"); + TORCH_CHECK(params.size() % 2 == 0, "got an incorrect number of RNN parameters"); for (size_t i = 0; i < params.size(); i += 2) { result.emplace_back(params[i], params[i + 1], undefined, undefined); } @@ -174,7 +174,7 @@ static std::vector 
gather_params(TensorList params, bool has_biases) static std::vector gather_quantized_params(TensorList params) { static at::Tensor undefined; std::vector result; - AT_CHECK(params.size() % 12 == 0, "got an incorrect number of quantized RNN parameters"); + TORCH_CHECK(params.size() % 12 == 0, "got an incorrect number of quantized RNN parameters"); for (size_t i = 0; i < params.size(); i += 12) { result.emplace_back(params[i], params[i + 1], params[i + 2], params[i + 3], params[i + 4], params[i + 5], params[i + 6], params[i + 7], @@ -512,8 +512,8 @@ LayerOutput> apply_layer_stack(const Layer& layer, const io_type& input, const std::vector& hiddens, const std::vector& weights, int64_t num_layers, double dropout_p, bool train) { - AT_CHECK(num_layers == hiddens.size(), "Expected more hidden states in stacked_rnn"); - AT_CHECK(num_layers == weights.size(), "Expected more weights in stacked_rnn"); + TORCH_CHECK(num_layers == hiddens.size(), "Expected more hidden states in stacked_rnn"); + TORCH_CHECK(num_layers == weights.size(), "Expected more weights in stacked_rnn"); auto layer_input = input; auto hidden_it = hiddens.begin(); @@ -658,7 +658,7 @@ std::tuple lstm( const Tensor& _input, TensorList hx, TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(_input)) { Tensor output, hy, cy; lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases, @@ -680,7 +680,7 @@ std::tuple lstm( const Tensor& data, const Tensor& batch_sizes, TensorList hx, TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(data)) { Tensor output, hy, cy; lstm_packed_cudnn_stub(data.type().device_type(), output, hy, cy, data, batch_sizes, hx, @@ -698,7 +698,7 @@ std::tuple lstm( std::tuple lstm_cell( const Tensor& input, TensorList hx, const Tensor& w_ih, const Tensor& w_hh, const Tensor& b_ih, const Tensor& b_hh) { - AT_CHECK(hx.size() == 2, "lstm_cell expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm_cell expects two hidden states"); return LSTMCell{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh}); } @@ -730,7 +730,7 @@ std::tuple quantized_lstm( const Tensor& _input, TensorList hx, TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); + TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(_input)) { Tensor output, hy, cy; lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases, @@ -739,7 +739,7 @@ std::tuple quantized_lstm( } check_device(_input, _params, hx); auto input = batch_first ? 
_input.transpose(0, 1) : _input; - AT_CHECK(has_biases, "quantized LSTM requires biases"); + TORCH_CHECK(has_biases, "quantized LSTM requires biases"); auto params = gather_quantized_params(_params); auto results = _lstm_impl( input, params, hx[0], hx[1], num_layers, dropout_p, train, bidirectional); diff --git a/aten/src/ATen/native/RNN.h b/aten/src/ATen/native/RNN.h index a4a359a07380..d9bdd90e4860 100644 --- a/aten/src/ATen/native/RNN.h +++ b/aten/src/ATen/native/RNN.h @@ -25,7 +25,7 @@ inline void check_device(const Tensor& input, const TensorList& params, const Te auto check_tensors = [&](const std::string& name, const Tensor& t) { if (!t.defined()) return; auto t_device = t.device(); - AT_CHECK(input_device == t_device, + TORCH_CHECK(input_device == t_device, "Input and ", name, " tensors are not at the same device, found input tensor at ", input_device, " and ", name, " tensor at ", t_device); }; diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index 2a7fe9b4f1f8..63a4d7b71cc0 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -9,7 +9,7 @@ namespace at { namespace native { Tensor& linspace_cpu_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -42,7 +42,7 @@ Tensor& linspace_cpu_out(Tensor& result, Scalar start, Scalar end, int64_t steps } Tensor& logspace_cpu_out(Tensor& result, Scalar start, Scalar end, int64_t steps, double base) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -82,11 +82,11 @@ Tensor& range_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) { auto xend = end.to(); auto xstep = step.to(); - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); int64_t size = static_cast(((xend - xstart) / xstep) + 1); if (result.numel() != size) { @@ -132,14 +132,14 @@ Tensor& arange_cpu_out(Tensor& result, Scalar start, Scalar end, Scalar step) { / step.to()); } - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); - AT_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), "invalid size, possible overflow?"); int64_t size = static_cast(size_d); diff --git a/aten/src/ATen/native/ReduceOps.cpp 
b/aten/src/ATen/native/ReduceOps.cpp index 9cb247defb4c..330cad03a2af 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -91,7 +91,7 @@ static std::unique_ptr make_reduction( bool keepdim, ScalarType dtype) { // check that result type and dtype match if provided - AT_CHECK( + TORCH_CHECK( !result.defined() || result.scalar_type() == dtype, name, ": provided dtype must match dtype of result. Got ", toString(result.scalar_type()), @@ -114,6 +114,41 @@ static std::unique_ptr make_reduction( return TensorIterator::reduce_op(viewed_result, self.to(dtype)); } +static std::unique_ptr make_reduction( + const char* name, Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim, + bool keepdim, ScalarType dtype) +{ + // check that result type and dtype match if provided + for (const Tensor *t: {&result1, &result2}) { + const Tensor& result = *t; + TORCH_CHECK( + !result.defined() || result.type().scalarType() == dtype, + name, ": provided dtype must match dtype of result. Got ", + toString(result.type().scalarType()), + " and ", + toString(dtype), + "."); + } + + int64_t ndim = self.dim(); + DimMask mask = make_dim_mask(dim, ndim); + allocate_reduction_result(result1, self, mask, keepdim, dtype); + auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim); + + allocate_reduction_result(result2, self, mask, keepdim, dtype); + auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim); + + // special case for type promotion in mixed precision, improves computational + // efficiency. + // We don't generalize this to common mismatched input/output types to avoid cross + // product of templated kernel launches. + if (self.type().scalarType() == dtype || + (self.is_cuda() && self.type().scalarType() == kHalf && dtype == kFloat)) { + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self); + } + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype)); +} + static inline int64_t n_dim_size(const Tensor& self, IntArrayRef dim) { int64_t numel = 1; for (auto d : dim) { @@ -136,7 +171,7 @@ Tensor cumsum(const Tensor& self, int64_t dim) { static inline Tensor& cumsum_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { // result type is favored over dtype; check that they match if provided (NumPy doesn't check) - AT_CHECK( + TORCH_CHECK( !dtype.has_value() || (result.scalar_type() == dtype.value()), "provided dtype must match dtype of result in cumsum. Got ", toString(result.scalar_type()), @@ -168,7 +203,7 @@ Tensor cumprod(const Tensor& self, int64_t dim) { static inline Tensor& cumprod_out(Tensor& result, const Tensor& self, int64_t dim, optional dtype) { // result type is favored over dtype; check that they match if provided (NumPy doesn't check) - AT_CHECK( + TORCH_CHECK( !dtype.has_value() || (result.scalar_type() == dtype.value()), "provided dtype must match dtype of result in cumprod. Got ", toString(result.scalar_type()), @@ -257,7 +292,7 @@ Tensor prod(const Tensor &self) { static inline Tensor &mean_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool keepdim, optional opt_dtype) { ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); - AT_CHECK( + TORCH_CHECK( at::isFloatingType(scalarType), "Can only calculate the mean of floating types. 
Got ", toString(scalarType), @@ -419,11 +454,11 @@ Tensor logsumexp(const Tensor &self, IntArrayRef dims, bool keepdim) { static Tensor& norm_out(Tensor &result, const Tensor &self, optional opt_p, IntArrayRef dim, bool keepdim, optional opt_dtype) { auto p = opt_p.value_or(2.0); - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); - AT_CHECK( + TORCH_CHECK( at::isFloatingType(scalarType), "Can only calculate the mean of floating types. Got ", toString(scalarType), @@ -443,9 +478,9 @@ static inline Tensor _norm(const Tensor &self, Scalar p) { if (self.is_sparse()) { return at::native_norm(self, p); } else { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "norm only supports floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "norm only supports floating-point dtypes"); Tensor result; return at::native::norm_out(result, self, p, {}, false, c10::nullopt); @@ -494,10 +529,10 @@ inline Tensor & _all(Tensor & result, std::unique_ptr & iter) { } Tensor all(const Tensor& self) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "all only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "all only supports torch.uint8 dtype"); Tensor result = at::empty({0}, self.options()); @@ -512,10 +547,10 @@ Tensor all(const Tensor& self, int64_t dim, bool keepdim) { } Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "all only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "all only supports torch.uint8 dtype"); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { @@ -538,10 +573,10 @@ inline Tensor & _any(Tensor & result, std::unique_ptr & iter) { } Tensor any(const Tensor& self) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "any only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "any only supports torch.uint8 dtype"); Tensor result = at::empty({0}, self.options()); @@ -556,10 +591,10 @@ Tensor any(const Tensor& self, int64_t dim, bool keepdim) { } Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "any 
only supports CPU AND CUDA " "backend, got: ", toString(self.type().backend())); - AT_CHECK(self.scalar_type() == at::ScalarType::Byte, + TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte, "any only supports torch.uint8 dtype"); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { @@ -578,7 +613,7 @@ Tensor min_values(const Tensor& self, IntArrayRef dims, bool keepdim) { Tensor result = at::empty({0}, self.options()); ScalarType dtype = get_dtype(result, self, {}, true); auto iter = make_reduction("min_values", result, self, dims, keepdim, dtype); - AT_CHECK(iter->numel() > 0, "min_values on a tensor with no elements is not defined."); + TORCH_CHECK(iter->numel() > 0, "min_values on a tensor with no elements is not defined."); min_values_stub(iter->device_type(), *iter); return result; } @@ -591,16 +626,16 @@ Tensor max_values(const Tensor& self, IntArrayRef dims, bool keepdim) { Tensor result = at::empty({0}, self.options()); ScalarType dtype = get_dtype(result, self, {}, true); auto iter = make_reduction("max_values", result, self, dims, keepdim, dtype); - AT_CHECK(iter->numel() > 0, "max_values on a tensor with no elements is not defined."); + TORCH_CHECK(iter->numel() > 0, "max_values on a tensor with no elements is not defined."); max_values_stub(iter->device_type(), *iter); return result; } } static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "std and var only support CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "std and var only support floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "std and var only support floating-point dtypes"); ScalarType dtype = get_dtype(result, self, {}, true); auto iter = make_reduction("std or var", result, self, dim, keepdim, dtype); if (iter->numel() == 0) { @@ -611,10 +646,72 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, return result; } +static std::tuple std_var_mean_out(const char* fname, Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) { + AT_ASSERT(result1.defined() && result2.defined()); + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + fname, " only support CPU and CUDA backend, got: ", toString(self.type().backend())); + TORCH_CHECK(at::isFloatingType(self.type().scalarType()), fname, " only support floating-point dtypes"); + TORCH_CHECK(result1.type().scalarType() == result2.type().scalarType(), + "provided by result1 dtype must match dtype of result2. 
Got ", + toString(result1.type().scalarType()), + " and ", + toString(result2.type().scalarType()), + "."); + ScalarType dtype = get_dtype(result1, self, {}, true); + auto iter = make_reduction(fname, result1, result2, self, dim, keepdim, dtype); + if (iter->numel() == 0) { + result1.fill_(NAN); + result2.fill_(NAN); + } else { + std_var_stub(iter->device_type(), *iter, unbiased, take_sqrt); + } + return std::tuple(result1, result2); +} + +std::tuple var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) { + return std_var_mean_out("var_mean", result1, result2, self, dim, unbiased, keepdim, false); +} + +std::tuple std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) { + return std_var_mean_out("std_mean", result1, result2, self, dim, unbiased, keepdim, true); +} + +std::tuple var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) { + return std_var_mean_out("var_mean", result1, result2, self, {}, unbiased, false, false); +} + +std::tuple std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) { + return std_var_mean_out("std_mean", result1, result2, self, {}, unbiased, false, true); +} + +std::tuple var_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::var_mean_out(result1, result2, self, dim, unbiased, keepdim); +} + +std::tuple std_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::std_mean_out(result1, result2, self, dim, unbiased, keepdim); +} + +std::tuple std_mean(const Tensor& self, bool unbiased) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::std_mean_out(result1, result2, self, unbiased); +} + +std::tuple var_mean(const Tensor& self, bool unbiased) { + Tensor result1 = at::empty({0}, self.options()); + Tensor result2 = at::empty({0}, self.options()); + return at::native::var_mean_out(result1, result2, self, unbiased); +} + Tensor var(const Tensor& self, bool unbiased) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "var only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "var only supports floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "var only supports floating-point dtypes"); auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); return trivial_return.has_value() ? 
trivial_return.value() : at::legacy::th::_th_var(self, unbiased); } @@ -629,9 +726,9 @@ Tensor &var_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool unbias } Tensor std(const Tensor& self, bool unbiased) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "std only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); - AT_CHECK(at::isFloatingType(self.scalar_type()), "std only supports floating-point dtypes"); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "std only supports floating-point dtypes"); auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits::quiet_NaN()); return trivial_return.has_value() ? trivial_return.value() : at::legacy::th::_th_std(self, unbiased); } diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 5689f02bea57..22889b08a0bc 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -61,7 +61,7 @@ void reflection_pad1d_out_template( int64_t dim_w = 1; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 2 || input_.ndimension() == 3), "non-empty 2D " "or 3D (batch mode) tensor expected for input, but got: ", input_); @@ -79,11 +79,11 @@ void reflection_pad1d_out_template( int64_t input_w = input_.size(dim_w); int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size " + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size " "should be less than the corresponding input dimension, but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.sizes()); - AT_CHECK(output_w >= 1 , 2, + TORCH_CHECK(output_w >= 1 , 2, "input (W: ", input_w, ")is too small. Calculated output W: ", output_w); /* get contiguous input */ @@ -179,7 +179,7 @@ void reflection_pad1d_backward_out_template( int64_t input_w = input.size(dim_w); int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w == grad_output_.size(dim_w), "grad_output width unexpected." + TORCH_CHECK(output_w == grad_output_.size(dim_w), "grad_output width unexpected." 
" Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); /* get contiguous grad_output */ @@ -280,7 +280,7 @@ void reflection_pad2d_out_template( int dim_slices = 0; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 3 || input_.ndimension() == 4), "non-empty 3D or " "4D (batch mode) tensor expected for input, but got: ", input_); @@ -303,17 +303,17 @@ void reflection_pad2d_out_template( int64_t output_h = input_h + pad_t + pad_b; int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(pad_l < input_w && pad_r < input_w, + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size should be less than the corresponding " "input dimension, but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.ndimension()); - AT_CHECK(pad_t < input_h && pad_b < input_h, + TORCH_CHECK(pad_t < input_h && pad_b < input_h, "Argument #6: Padding size should be less than the corresponding " "input dimension, but got: padding (", pad_t, ", ", pad_b, ") at dimension ", dim_h, " of input ", input_.ndimension()); - AT_CHECK(output_w >= 1 || output_h >= 1, + TORCH_CHECK(output_w >= 1 || output_h >= 1, "input (H: ", input_h, ", W: ", input_w, ")is too small. Calculated " "output H: ", output_h, " W: ", output_w); @@ -435,11 +435,11 @@ void reflection_pad2d_backward_out_template( int64_t output_h = input_h + pad_t + pad_b; int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w == grad_output_.size(dim_w), + TORCH_CHECK(output_w == grad_output_.size(dim_w), "gradOutput width unexpected. Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); - AT_CHECK(output_h == grad_output_.size(dim_h), + TORCH_CHECK(output_h == grad_output_.size(dim_h), "gradOutput height unexpected. 
Expected: ", output_h, ", Got: ", grad_output_.size(dim_h)); diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index 0137c80098c9..d63d0a511d61 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -32,7 +32,7 @@ Tensor repeat_interleave(const Tensor &self, const Tensor &repeats, c10::optiona if (repeats.dim() == 0 || (repeats.dim() == 1 && repeats.size(0) == 1)) { repeats_ = repeats.reshape({1}).expand({input.size(dim.value())}); } else if (repeats.dim() == 1) { - AT_CHECK(repeats.size(0) == input.size(dim.value()), "repeats must have the same size as input along dim") + TORCH_CHECK(repeats.size(0) == input.size(dim.value()), "repeats must have the same size as input along dim") } else { AT_ERROR("repeats must be 0-dim or 1-dim tensor"); } diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index a1ba07527d8f..e44e32b6bbdc 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -6,9 +6,9 @@ namespace at { namespace native { template static inline Tensor repeat_interleave_common(const Tensor &repeats) { - AT_CHECK(repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); - AT_CHECK(repeats.scalar_type() == at::kLong, "repeats has to be Long tensor"); - AT_CHECK((repeats >= 0).all().item(), "repeats can not be negative"); + TORCH_CHECK(repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); + TORCH_CHECK(repeats.scalar_type() == at::kLong, "repeats has to be Long tensor"); + TORCH_CHECK((repeats >= 0).all().item(), "repeats can not be negative"); Tensor repeats_ = repeats.contiguous(); Tensor cumsum = repeats.cumsum(0); int64_t total = cumsum[-1].item(); diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 562f11215037..4ecab611a231 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -67,11 +67,11 @@ void replication_pad1d_out_cpu_template( int dimw = 1; int dimslices = 0; long nbatch = 1; - AT_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; - AT_CHECK(input_.numel() > 0 + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 2 || input_.ndimension() == 3), "non-empty 2D or 3D (batch mode) tensor expected for input"); @@ -87,7 +87,7 @@ void replication_pad1d_out_cpu_template( long iwidth = input_.size(dimw); long owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth >= 1, + TORCH_CHECK(owidth >= 1, "input (W: ", iwidth, ") is too small." " Calculated output W: ", owidth); @@ -193,7 +193,7 @@ Tensor& replication_pad1d_backward_out_cpu_template( int dimw = 1; int dimslices = 0; long nbatch = 1; - AT_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; @@ -209,7 +209,7 @@ Tensor& replication_pad1d_backward_out_cpu_template( long iwidth = input.size(dimw); long owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth == gradOutput_.size(dimw), + TORCH_CHECK(owidth == gradOutput_.size(dimw), "gradOutput width unexpected. 
Expected: ", owidth, " Got: ", gradOutput_.size(dimw)); @@ -329,7 +329,7 @@ void replication_pad2d_out_cpu_template(Tensor& output, const Tensor& input_, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; int pad_t = paddingSize[2]; @@ -339,7 +339,7 @@ void replication_pad2d_out_cpu_template(Tensor& output, int dimslices = 0; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && (input_.dim() == 3 || input_.dim() == 4), + TORCH_CHECK(input_.numel() > 0 && (input_.dim() == 3 || input_.dim() == 4), "3D or 4D (batch mode) tensor expected for input, but got: ", input_); if (input_.dim() == 4) @@ -357,7 +357,7 @@ void replication_pad2d_out_cpu_template(Tensor& output, int64_t oheight = iheight + pad_t + pad_b; int64_t owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth >= 1 || oheight >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1, "input (H: ", iheight, ", W: ", iwidth, " ) is too small." " Calculated output H: ", oheight, " W: ", owidth); @@ -473,7 +473,7 @@ Tensor& replication_pad2d_backward_out_cpu_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; int pad_t = paddingSize[2]; @@ -498,10 +498,10 @@ Tensor& replication_pad2d_backward_out_cpu_template( int64_t oheight = iheight + pad_t + pad_b; int64_t owidth = iwidth + pad_l + pad_r; - AT_CHECK(owidth == gradOutput_.size(dimw), + TORCH_CHECK(owidth == gradOutput_.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput_.size(dimw)); - AT_CHECK(oheight == gradOutput_.size(dimh), + TORCH_CHECK(oheight == gradOutput_.size(dimh), "gradOutput height unexpected. Expected: ", oheight, ", Got: ", gradOutput_.size(dimh)); @@ -557,7 +557,7 @@ static inline void shapeCheck3d( int dimd = 1; int dimslices = 0; - AT_CHECK(input.numel() > 0 && (input.dim() == 4 || input.dim() == 5), + TORCH_CHECK(input.numel() > 0 && (input.dim() == 4 || input.dim() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); if (input.dim() == 5) @@ -577,7 +577,7 @@ static inline void shapeCheck3d( int64_t oheight = iheight + ptop + pbottom; int64_t owidth = iwidth + pleft + pright; - AT_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, ") is too small." 
" Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); @@ -674,7 +674,7 @@ void replication_pad3d_out_cpu_template( const Tensor& input_, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; @@ -832,7 +832,7 @@ Tensor& replication_pad3d_backward_out_cpu_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 27ceb5fcf8ed..1e8e661eaf09 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -63,7 +63,7 @@ static inline void checkInBoundsForStorage( return; } int64_t new_storage_size = new_storage.numel(); - AT_CHECK( + TORCH_CHECK( storage_offset + storage_size <= new_storage_size, "setStorage: sizes ", size, ", strides ", stride, "," " and storage offset ", storage_offset, @@ -84,7 +84,7 @@ inline void setStrided( checkInBoundsForStorage(size, stride, storage_offset, self_->storage()); /* storage offset */ - AT_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); self_->set_storage_offset(storage_offset); /* size and stride */ diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index c018b3a3cc40..7e9cdf30d032 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -6,7 +6,7 @@ namespace native { Scalar item(const Tensor& self) { int64_t numel = self.numel(); - AT_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); + TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); if (self.is_sparse()) { if (self._nnz() == 0) return Scalar(0); if (self.is_coalesced()) return at::_local_scalar_dense(self._values()); diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 144d38e5400c..7831ef5035d5 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -6,9 +6,11 @@ #if defined(__CUDACC__) #include #include +#include #elif defined(__HIPCC__) #include #include +#include #else #include #define device_sqrt std::sqrt @@ -42,7 +44,7 @@ struct WelfordData { }; -template +template struct WelfordOps { bool unbiased; bool take_sqrt; @@ -80,12 +82,18 @@ struct WelfordOps { new_count }; } - inline C10_DEVICE scalar_t project(acc_t acc) const { + inline C10_DEVICE res_t project(acc_t acc) const { + auto mean = acc.mean; combine_t divisor = unbiased ? (acc.nf - 1) : acc.nf; auto ret = (divisor > 0) ? (take_sqrt ? 
device_sqrt(acc.m2 / divisor) : (acc.m2 / divisor)) : NAN; - return (scalar_t) ret; +#if defined(__CUDACC__) || defined(__HIPCC__) + thrust::tuple results((scalar_t) ret, (scalar_t) mean); +#else + std::tuple results{(scalar_t) ret, (scalar_t) mean}; +#endif + return results; } #if defined(__CUDACC__) || defined(__HIPCC__) inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { diff --git a/aten/src/ATen/native/SobolEngineOps.cpp b/aten/src/ATen/native/SobolEngineOps.cpp index 1e0af21473b7..b0b1c36664f6 100644 --- a/aten/src/ATen/native/SobolEngineOps.cpp +++ b/aten/src/ATen/native/SobolEngineOps.cpp @@ -16,9 +16,9 @@ namespace native { /// `sobolstate`. std::tuple _sobol_engine_draw(const Tensor& quasi, int64_t n, const Tensor& sobolstate, int64_t dimension, int64_t num_generated, optional dtype) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); - AT_CHECK(quasi.dtype() == at::kLong, + TORCH_CHECK(quasi.dtype() == at::kLong, "quasi needs to be of type ", at::kLong); Tensor wquasi = quasi.clone(); @@ -55,9 +55,9 @@ std::tuple _sobol_engine_draw(const Tensor& quasi, int64_t n, co /// specified above. Tensor& _sobol_engine_ff_(Tensor& quasi, int64_t n, const Tensor& sobolstate, int64_t dimension, int64_t num_generated) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); - AT_CHECK(quasi.dtype() == at::kLong, + TORCH_CHECK(quasi.dtype() == at::kLong, "quasi needs to be of type ", at::kLong); // We deal with `data` and `strides` due to performance issues. @@ -82,7 +82,7 @@ Tensor& _sobol_engine_ff_(Tensor& quasi, int64_t n, const Tensor& sobolstate, /// and a list of random lower triangular matrices consisting of 0s and 1s. `dimension` is /// passed explicitly again. Tensor& _sobol_engine_scramble_(Tensor& sobolstate, const Tensor& ltm, int64_t dimension) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); /// Require a tensor accessor for `sobolstate` @@ -121,7 +121,7 @@ Tensor& _sobol_engine_scramble_(Tensor& sobolstate, const Tensor& ltm, int64_t d /// This is a core function to initialize the main state variable of a `SobolEngine`. 
/// `dimension` is passed explicitly as well (see why above) Tensor& _sobol_engine_initialize_state_(Tensor& sobolstate, int64_t dimension) { - AT_CHECK(sobolstate.dtype() == at::kLong, + TORCH_CHECK(sobolstate.dtype() == at::kLong, "sobolstate needs to be of type ", at::kLong); /// First row of `sobolstate` is 1 diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 60ba37ef69e1..aa81f09bee2f 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -127,7 +127,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_ } if (input.dim() == 0) input = input.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { @@ -151,7 +151,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_, const bool half } if (input.dim() == 0) input = input.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { @@ -183,7 +183,7 @@ Tensor softmax_backward_cpu( grad = grad.view(1); if (output.dim() == 0) output = output.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { @@ -215,7 +215,7 @@ Tensor log_softmax_backward_cpu( grad = grad.view(1); if (output.dim() == 0) output = output.view(1); - AT_CHECK( + TORCH_CHECK( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index a2b387ae621c..4fee5be8da34 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -138,11 +138,11 @@ std::tuple kthvalue_out_cpu( // FIXME: This seems bogus, I only do this because it was the old behaviour. // The reductions are fine, as long as the axis being reduced along // isn't of 0 elements (and the output has elements). - AT_CHECK( + TORCH_CHECK( self.numel() > 0, "cannot perform reduction function kthvalue", " on tensor with no elements because the operation does not have an identity"); - AT_CHECK( + TORCH_CHECK( k > 0 && k <= (self.dim() > 0 ? 
self.size(dim) : 1), "selected index k out of range"); @@ -225,7 +225,7 @@ std::tuple median( // this does not reduce to median with dim beause we don't want to copy twice Tensor median_cpu(const Tensor& self) { - AT_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); + TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); if (self.dim() == 0 && self.numel() == 1) { return self.clone(); } diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h index 9cc7afb8b4c1..b8d97750a97e 100644 --- a/aten/src/ATen/native/SortingUtils.h +++ b/aten/src/ATen/native/SortingUtils.h @@ -17,7 +17,7 @@ static void _reduction_with_indices_allocate_or_resize_output( result_sizes[dim] = 1; } if (values.defined()) { - AT_CHECK( + TORCH_CHECK( self.type() == values.type(), "output values must be of same type as input"); if (!keepdim && values.dim() == self.dim() - 1) { @@ -29,9 +29,9 @@ static void _reduction_with_indices_allocate_or_resize_output( values = at::empty(result_sizes, self.options()); } if (indices.defined()) { - AT_CHECK( + TORCH_CHECK( indices.dtype() == kLong, "output indices must be of scalar type Long"); - AT_CHECK( + TORCH_CHECK( indices.device() == self.device(), "output indices must be on same device as input"); if (!keepdim && indices.dim() == self.dim() - 1) { diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index d00169c75f3b..83d37452e479 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -24,10 +24,10 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, const bool inverse, IntArrayRef signal_sizes, const bool normalized, const bool onesided) { - AT_CHECK(signal_ndim >= 1 && signal_ndim <= 3, + TORCH_CHECK(signal_ndim >= 1 && signal_ndim <= 3, "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=", signal_ndim); - AT_CHECK(at::isFloatingType(self.scalar_type()), + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Expected an input tensor of floating types, but got input=", self.type(), self.sizes()); @@ -62,14 +62,14 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, // now we assume that input is batched as [ B x signal_dims... 
] if (complex_input) { - AT_CHECK(input.size(signal_ndim + 1) == 2, + TORCH_CHECK(input.size(signal_ndim + 1) == 2, "Expected an input tensor with a last dimension of size 2 " "representing real + imaginary components, but got input ", self.type(), self.sizes()); } // build signal_sizes and output_size - AT_CHECK(signal_sizes.size() == 0 || static_cast(signal_sizes.size()) == signal_ndim, + TORCH_CHECK(signal_sizes.size() == 0 || static_cast(signal_sizes.size()) == signal_ndim, "Expected signal_sizes to be empty (default) or of signal_ndim=", signal_ndim, "D, but got signal_sizes=", signal_sizes); std::vector output_sizes(signal_ndim + 1 + static_cast(complex_output)); @@ -98,7 +98,7 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, output_sizes[i + 1] = input_size; } checked_signal_sizes[i] = input_size; - AT_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i], + TORCH_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i], "Expected given signal_sizes=", signal_sizes," to have same " "shape with input at signal dimension ", i, ", but got " "signal_sizes=", signal_sizes, " and input=", self.type(), diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 9d0c2be2eb9e..9aaf55f47b35 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -119,7 +119,7 @@ std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { std::tuple mode_out(Tensor& values, Tensor& indices, const Tensor& self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "mode only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "mode")) { @@ -154,7 +154,7 @@ std::tuple max(const Tensor& self, int64_t dim, bool keepdim) { std::tuple max_out(Tensor& max, Tensor& max_indices, const Tensor& self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "max only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial_no_ident(max, self, dim, keepdim, "max")) { @@ -193,7 +193,7 @@ std::tuple min(const Tensor& self, int64_t dim, bool keepdim) { std::tuple min_out(Tensor& min, Tensor& min_indices, const Tensor& self, int64_t dim, bool keepdim) { - AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "min only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial_no_ident(min, self, dim, keepdim, "min")) { diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index a4006f93b182..75a74f112cc1 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -26,12 +26,12 @@ static inline Tensor to_impl(const Tensor& self, const TensorOptions& options, b } Tensor to(const Tensor& self, const TensorOptions& options, bool non_blocking, bool 
copy) { - AT_CHECK(options.requires_grad_opt() == c10::nullopt, + TORCH_CHECK(options.requires_grad_opt() == c10::nullopt, "to(options) expects unset requires_grad flag, but got " "options.requires_grad set as ", options.requires_grad()); const auto & layout_opt = options.layout_opt(); - AT_CHECK(!layout_opt || self.layout() == layout_opt.value(), + TORCH_CHECK(!layout_opt || self.layout() == layout_opt.value(), "to(options) doesn't support converting to a different layout, " "but got self.layout being ", self.layout(), " and options.layout set as ", options.layout()); diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 8a75f1851f1d..5cb6e18a6ffb 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -35,17 +35,17 @@ void window_function_checks( const char* function_name, const TensorOptions& options, int64_t window_length) { - AT_CHECK( + TORCH_CHECK( options.layout() != kSparse, function_name, " is not implemented for sparse types, got: ", options); - AT_CHECK( + TORCH_CHECK( at::isFloatingType(typeMetaToScalarType(options.dtype())), function_name, " expects floating point dtypes, got: ", options); - AT_CHECK( + TORCH_CHECK( window_length >= 0, function_name, " requires non-negative window_length, got window_length=", @@ -182,7 +182,7 @@ Tensor& eye_out_cpu(Tensor& result, int64_t n) { } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { - AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); if(m < 0) { m = n; @@ -459,7 +459,7 @@ Tensor& randperm_out(Tensor& result, int64_t n) { } Tensor& randperm_out_cpu(Tensor& result, int64_t n, Generator* generator) { - AT_CHECK(n >= 0, "n must be non-negative, got", n); + TORCH_CHECK(n >= 0, "n must be non-negative, got", n); result.resize_({n}); auto gen = get_generator(generator); AT_DISPATCH_ALL_TYPES(result.scalar_type(), "randperm", [&]() -> void { @@ -738,7 +738,7 @@ AT_FORALL_SCALAR_TYPES_EXCEPT_HALF_AND_QINT(TENSOR) #undef TENSOR Tensor from_file(std::string filename, c10::optional shared, c10::optional size, const TensorOptions& options) { - AT_CHECK(!options.pinned_memory(), "tensors constructed from a file cannot be pinned"); + TORCH_CHECK(!options.pinned_memory(), "tensors constructed from a file cannot be pinned"); size_t my_size = size.value_or(0); int flags = shared.value_or(false) ? 
TH_ALLOCATOR_MAPPED_SHARED : 0; auto dtype = options.dtype(); diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index c17b4c1b3d10..08916147d3f6 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -49,10 +49,10 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { inline void check_args( int64_t row, int64_t col, const TensorOptions& options) { - AT_CHECK(row >= 0, "row must be non-negative, got", row); - AT_CHECK(col >= 0, "col must be non-negative, got", col); + TORCH_CHECK(row >= 0, "row must be non-negative, got", row); + TORCH_CHECK(col >= 0, "col must be non-negative, got", col); if (options.has_layout()) { - AT_CHECK( + TORCH_CHECK( options.layout() == at::kStrided, "only support layout=torch.strided, got", options.layout()) @@ -61,7 +61,7 @@ inline void check_args( inline void check_size_nonnegative(IntArrayRef size) { for (auto x: size) { - AT_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size); + TORCH_CHECK(x >= 0, "Trying to create tensor with negative dimension ", x, ": ", size); } } } // namespace native diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index ae823a30f7b1..b731b93b1dac 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -409,6 +409,23 @@ bool TensorIterator::is_trivial_1d() const { return ndim() == 1; } +bool TensorIterator::is_contiguous() const { + if (numel() == 1) { + return true; + } + if (ndim() != 1) { + return false; + } + int num_tensors = ntensors(); + for (int i = 0; i < num_tensors; i++) { + if (strides(i)[0] != element_size(i)) { + return false; + } + } + return true; +} + + bool TensorIterator::is_scalar(int arg) const { const auto& stride = operands_[arg].stride_bytes; for (int i = 0; i < ndim(); i++) { @@ -468,7 +485,7 @@ void TensorIterator::select_all_keeping_dim(int start_dim, IntArrayRef indices) std::unique_ptr TensorIterator::binary_op(Tensor& out, const Tensor& a, const Tensor& b) { auto builder = TensorIterator::Builder(); if (a.device().is_cuda() && b.device().is_cuda()) { - AT_CHECK(a.device() == b.device(), + TORCH_CHECK(a.device() == b.device(), "binary_op(): expected both inputs to be on same device, but input a " "is on ", a.device(), " and input b is on ", b.device()); } @@ -486,6 +503,14 @@ std::unique_ptr TensorIterator::unary_op(Tensor& out, const Tens return builder.build(); } +std::unique_ptr TensorIterator::nullary_op(Tensor& out) { + auto builder = TensorIterator::Builder(); + builder.add_output(out); + // FIXME: workaround for bug: https://github.com/pytorch/pytorch/issues/20342 + builder.iter_->resize_outputs_ = false; + return builder.build(); +} + std::unique_ptr TensorIterator::reduce_op(Tensor& out, const Tensor& a) { AT_ASSERT(out.defined()); auto builder = TensorIterator::Builder(); @@ -497,6 +522,28 @@ std::unique_ptr TensorIterator::reduce_op(Tensor& out, const Ten return builder.build(); } +std::unique_ptr TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tensor& a) { + AT_ASSERT(out1.defined()); + AT_ASSERT(out2.defined()); + TORCH_CHECK((!a.is_cuda() && !out1.is_cuda() && !out2.is_cuda()) || (a.device() == out1.device() && out1.device() == out2.device()), + "reduce_op(): expected input and both outputs to be on same device, but input is on ", a.device(), + ", output1 is on ", out1.device(), " and output2 is on", out2.device()); + TORCH_CHECK(out1.dim() == out2.dim(), 
"reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(), + " and output2 has ", out2.dim()); + TORCH_CHECK(out1.sizes() == out2.sizes(), "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(), + " and output2 has ", out2.sizes()); + TORCH_CHECK(out1.strides() == out2.strides(), "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(), + " and output2 has ", out2.strides()); + auto builder = TensorIterator::Builder(); + builder.add_output(out1); + builder.add_output(out2); + builder.add_input(a); + builder.iter_->promote_gpu_output_dtypes_ = true; + builder.iter_->resize_outputs_ = false; + builder.iter_->is_reduction_ = true; + return builder.build(); +} + void TensorIterator::mark_outputs() { for (int i = 0; i < num_outputs_; i++) { operands_[i].is_output = true; diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 6a9ca8ca2150..9bbcda9531ee 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -64,6 +64,7 @@ struct DimCounter { DimVector values; int64_t offset; }; + struct CAFFE2_API OperandInfo { OperandInfo() {} explicit OperandInfo(const Tensor& t, const Backend backend=Backend::Undefined, const ScalarType dtype=ScalarType::Undefined) @@ -146,12 +147,16 @@ struct CAFFE2_API TensorIterator { static std::unique_ptr binary_op(Tensor& out, const Tensor& a, const Tensor& b); static std::unique_ptr unary_op(Tensor& out, const Tensor& a); + static std::unique_ptr nullary_op(Tensor& out); static std::unique_ptr reduce_op(Tensor& out, const Tensor& a); + static std::unique_ptr reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); int ndim() const { return shape_.size(); } IntArrayRef shape() const { return shape_; } int64_t numel() const; int ntensors() const { return operands_.size(); } + int noutputs() const { return num_outputs_; } + int ninputs() const { return ntensors() - noutputs(); } /// number of elements in the output operand. this is the same as numel() for /// operations that are not reductions. 
@@ -162,6 +167,8 @@ struct CAFFE2_API TensorIterator { /// 1-dimensional iteration and no buffering or type conversion bool is_trivial_1d() const; + /// Reducible to 1-dimensional and all operands are contiguous + bool is_contiguous() const; bool is_dim_reduced(int dim) const; /// Accessors for each operand @@ -169,6 +176,7 @@ struct CAFFE2_API TensorIterator { void* data_ptr(int arg) const; ScalarType dtype(int arg=0) const { return operands_[arg].dtype; } DeviceType device_type(int arg=0) const { return backendToDeviceType(operands_[arg].backend); } + Device device(int arg=0) const { return operands_[arg].tensor.device(); } int64_t element_size(int arg) const { return elementSize(dtype(arg)); } bool is_scalar(int arg) const; bool is_cpu_scalar(int arg) const; @@ -181,6 +189,11 @@ struct CAFFE2_API TensorIterator { return operands_[arg].tensor; } + Tensor input(int arg=0) const { + AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); + return operands_[num_outputs_ + arg].tensor; + } + /// Removes an operand from this iterator void remove_operand(int arg); /// Removes a dimension from this iterator diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 228bb1337774..e0dd738f7491 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -14,7 +14,7 @@ static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop); static void parallel_dim_reduction(TensorIterator& iter, const loop2d_t& loop); void TensorIterator::parallel_reduce(const loop2d_t& loop) { - AT_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output"); + TORCH_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output"); int64_t numel = this->numel(); if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || at::in_parallel_region()) { @@ -27,13 +27,13 @@ void TensorIterator::parallel_reduce(const loop2d_t& loop) { } static bool use_two_pass_reduction(TensorIterator& iter) { - return iter.tensor(0).numel() == 1; + return iter.output(0).numel() == 1; } static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) { int max_threads = at::get_num_threads(); - auto& dst = iter.tensor(0); + auto dst = iter.output(0); auto buffer_shape = DimVector(dst.sizes()); buffer_shape.insert(buffer_shape.begin(), max_threads); auto buffer = at::empty(buffer_shape, dst.options()); @@ -47,7 +47,7 @@ static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) { auto slice = buffer[thread_num]; slice.copy_(dst); - auto sub_iter = TensorIterator::reduce_op(slice, iter.tensor(1)); + auto sub_iter = TensorIterator::reduce_op(slice, iter.input(0)); sub_iter->serial_for_each(loop, {begin, end}); }); @@ -117,13 +117,14 @@ static void parallel_dim_reduction(TensorIterator& iter, const loop2d_t& loop) { } void TensorIterator::foreach_reduced_elt(const loop_subiter_t &loop, bool parallelize) { - AT_ASSERT(ntensors() == 2 && num_outputs_ == 1); + AT_ASSERT(ninputs() == 1); + AT_ASSERT(noutputs() >= 1); auto shape = this->shape(); - if (tensor(0).numel() == 0) { + if (output(0).numel() == 0) { return; } - if (tensor(0).numel() == 1) { + if (output(0).numel() == 1) { loop(*this); } else if (numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 64ee1fb1b5cd..1b6adb17aebd 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ 
b/aten/src/ATen/native/TensorProperties.cpp @@ -53,11 +53,37 @@ Tensor & detach_(Tensor & self) { } Tensor contiguous(const Tensor & self) { - if (self.is_contiguous()) { - return self; - } - return self.clone(); + return contiguous(self, MemoryFormat::Contiguous); } +Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { + if (self.is_contiguous(memory_format)) { + return self; + } + auto result = at::empty_like(self); + switch (memory_format) { + case MemoryFormat::Any: // Back compatibility with old defaults + case MemoryFormat::Contiguous: { + break; + } + case MemoryFormat::ChannelsLast: { + AT_CHECK( + result.dim() == 4, + " required rank 4 tensor to use channels_last format"); + std::vector newStrides(self.dim()); + auto sizes = result.sizes(); + newStrides[1] = 1; + newStrides[3] = sizes[1]; + newStrides[2] = newStrides[3] * sizes[3]; + newStrides[0] = newStrides[2] * sizes[2]; + result = result.as_strided(sizes, newStrides); + break; + } + default: { + AT_CHECK(false, " unsupported memory format"); + } + } + return result.copy_(self); } +} // namespace native } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 857585be9b55..277d715ee846 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -19,7 +19,7 @@ namespace at { namespace native { Tensor _reshape_from_tensor(const Tensor& self, const Tensor& shape_tensor) { - AT_CHECK(shape_tensor.dim() == 1); + TORCH_CHECK(shape_tensor.dim() == 1); std::vector shape; auto accessor = shape_tensor.accessor(); for (size_t i = 0; i < shape_tensor.numel(); ++i) { @@ -40,7 +40,7 @@ std::vector broadcast_tensors(TensorList tensors) { static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; - AT_CHECK(t.dim() > 0, + TORCH_CHECK(t.dim() > 0, "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); } } @@ -71,12 +71,12 @@ static void check_cat_sparse_dims(Tensor const &t, int64_t wrapped, int64_t sparse_dim, int64_t dense_dim) { - AT_CHECK(t.is_sparse(), + TORCH_CHECK(t.is_sparse(), "Concatenating sparse tensors, but a dense tensor was found at position ", pos, "."); - AT_CHECK(sizes_match_except(sizes, t.sizes(), wrapped), + TORCH_CHECK(sizes_match_except(sizes, t.sizes(), wrapped), "All tensors must have the same shape: ", sizes, " (except in the concatenating dimension)," " but found shape: ", t.sizes(), " at position ", pos, "."); - AT_CHECK(t.sparse_dim() == sparse_dim && t.dense_dim() == dense_dim, + TORCH_CHECK(t.sparse_dim() == sparse_dim && t.dense_dim() == dense_dim, "All tensors must have the same sparse_dim and dense_dim: ", sparse_dim, ", ", dense_dim, ", but tensor at position ", pos, " has ", t.sparse_dim(), ", ", t.dense_dim(), "."); } @@ -182,9 +182,9 @@ Tensor cat(TensorList tensors, int64_t dim) { } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { - AT_CHECK(self.dim() > 0, + TORCH_CHECK(self.dim() > 0, "chunk expects at least a 1-dimensional tensor"); - AT_CHECK(chunks > 0, + TORCH_CHECK(chunks > 0, "chunk expects `chunks` to be greater than 0, got: ", chunks); int64_t split_size = (self.size(dim) + chunks - 1) / chunks; @@ -210,7 +210,7 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ int64_t nDims = self.dim(); int64_t dim1 = maybe_wrap_dim(dim1_, nDims); int64_t dim2 = maybe_wrap_dim(dim2_, nDims); - AT_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); + 
TORCH_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); int64_t diag_size; int64_t storage_offset = self.storage_offset(); // compute storage offset and size for the diagonal @@ -256,7 +256,7 @@ Tensor diag_embed(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim int64_t nDims = self.dim() + 1; int64_t dim1 = maybe_wrap_dim(dim1_, nDims); int64_t dim2 = maybe_wrap_dim(dim2_, nDims); - AT_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); + TORCH_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_); int64_t new_dim_len = std::abs(offset) + self.size(-1); auto sizes = self.sizes().vec(); sizes.pop_back(); @@ -275,7 +275,7 @@ Tensor expand(const Tensor& self, IntArrayRef size, bool implicit) { // distinguish between expands inserted by broadcasts and those explicitly // requested by the user, because it is legal to remove implicit expands // from the graph, but not legal to remove the explicit ones. - AT_CHECK(size.size() >= (size_t)self.dim(), + TORCH_CHECK(size.size() >= (size_t)self.dim(), "expand(", self.type(), "{", self.sizes(), "}, size=", size, "): the number of sizes provided (", size.size(), ") ", "must be greater or equal to the number of dimensions in the tensor (", @@ -293,7 +293,7 @@ Tensor expand_as(const Tensor& self, const Tensor& other) { } Tensor sum_to_size(const Tensor& self, IntArrayRef size) { - AT_CHECK(is_expandable_to(size, self.sizes()), + TORCH_CHECK(is_expandable_to(size, self.sizes()), "size {", size, "} is not expandable to size {", self.sizes(), "}."); return sum_to(self, size); @@ -302,7 +302,7 @@ Tensor sum_to_size(const Tensor& self, IntArrayRef size) { Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto tid = self.type_id(); - AT_CHECK( + TORCH_CHECK( tid == CPUTensorId() || tid == CUDATensorId(), "as_strided is only implemented for strided CPU, CUDA and QuantizedCPU tensors."); auto result = detail::make_tensor(Storage(self.storage()), tid); @@ -313,7 +313,7 @@ Tensor as_strided_tensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef s Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto tid = self.type_id(); - AT_CHECK( + TORCH_CHECK( tid == QuantizedCPUTensorId(), "as_strided is only implemented for strided CPU, CUDA and QuantizedCPU tensors."); auto result = detail::make_tensor(Storage(self.storage()), tid, get_qtensorimpl(self)->quantizer()); @@ -330,10 +330,10 @@ Tensor &as_strided_(Tensor& self, IntArrayRef size, IntArrayRef stride, optional Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) { int64_t allDim = self.dim(); int64_t end = start+length; - AT_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); - AT_CHECK(dim >= 0 && dim < allDim, + TORCH_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); + TORCH_CHECK(dim >= 0 && dim < allDim, "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, "."); - AT_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), + TORCH_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), "Invalid range to narrow. 
range(start, start+length) must be a subset of range(0, ", self.size(dim), ").") Tensor indices = self._indices(); int64_t sparse_dim = self.sparse_dim(); @@ -366,19 +366,19 @@ Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t } Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { - AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); + TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } - AT_CHECK(length >= 0 && start <= cur_size - length, + TORCH_CHECK(length >= 0 && start <= cur_size - length, "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); return at::slice(self, dim, start, start + length, 1); } Tensor permute(const Tensor& self, IntArrayRef dims) { auto nDims = self.dim(); - AT_CHECK(dims.size() == (size_t)nDims, + TORCH_CHECK(dims.size() == (size_t)nDims, "number of dims don't match in permute"); auto oldSizes = self.sizes(); auto oldStrides = self.strides(); @@ -387,7 +387,7 @@ Tensor permute(const Tensor& self, IntArrayRef dims) { std::vector seen(nDims); for (int64_t i = 0; i < nDims; i++) { auto dim = maybe_wrap_dim(dims[i], nDims); - AT_CHECK(!seen[dim], + TORCH_CHECK(!seen[dim], "repeated dim in permute"); seen[dim] = true; newSizes[i] = oldSizes[dim]; @@ -397,7 +397,7 @@ Tensor permute(const Tensor& self, IntArrayRef dims) { } Tensor repeat(const Tensor& self, IntArrayRef repeats) { - AT_CHECK(repeats.size() >= (size_t)self.dim(), + TORCH_CHECK(repeats.size() >= (size_t)self.dim(), "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Add new leading dimensions to the tensor if the @@ -477,7 +477,7 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ auto sizes = self.sizes().vec(); auto strides = self.strides().vec(); // TODO: support negative strides - AT_CHECK(step > 0, "slice step must be positive"); + TORCH_CHECK(step > 0, "slice step must be positive"); if (start < 0) { start += sizes[dim]; } @@ -502,10 +502,10 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ } std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { - AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); - AT_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); int64_t dim_size = self.size(dim); - AT_CHECK(split_size > 0 || self.size(dim) == 0, + TORCH_CHECK(split_size > 0 || self.size(dim) == 0, "split_size can only be 0 if dimension size is 0, " "but got dimension size of ", dim_size); // if split_size is 0 and dimension size is 0, there is 1 split. 
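The chunking arithmetic that split() relies on in the hunk above can be summarized in a small host-side sketch: every split has split_size elements except possibly the last, and a zero-sized dimension with split_size == 0 still yields exactly one (empty) split. The split_lengths helper below is illustrative only, not part of the patch.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative sketch of how split() sizes its outputs.
std::vector<int64_t> split_lengths(int64_t dim_size, int64_t split_size) {
  int64_t num_splits = 1;
  if (split_size != 0) {
    num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
  }
  std::vector<int64_t> lengths(num_splits, split_size);
  // The last chunk holds whatever remains and may be smaller (or empty).
  lengths[num_splits - 1] = dim_size - split_size * (num_splits - 1);
  return lengths;
}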
@@ -526,7 +526,7 @@ std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { } std::vector split_with_sizes(const Tensor& self, IntArrayRef split_sizes, int64_t dim) { - AT_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); int64_t dim_size = self.size(dim); int64_t num_splits = split_sizes.size(); std::vector splits(num_splits); @@ -535,13 +535,13 @@ std::vector split_with_sizes(const Tensor& self, IntArrayRef split_sizes for (i = 0; i < num_splits; ++i) { auto length = split_sizes[i]; - AT_CHECK(length >= 0, + TORCH_CHECK(length >= 0, "split_with_sizes expects split_sizes have only non-negative ", "entries, but got split_sizes=", split_sizes); splits[i] = self.narrow(dim, start_idx, length); start_idx += length; } - AT_CHECK(start_idx == dim_size, + TORCH_CHECK(start_idx == dim_size, "split_with_sizes expects split_sizes to sum exactly to ", dim_size, " (input tensor's size at dimension ", dim, "), ", "but got split_sizes=", split_sizes); return splits; @@ -556,14 +556,14 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d } Tensor stack(TensorList tensors, int64_t dim) { - AT_CHECK(tensors.size() > 0, + TORCH_CHECK(tensors.size() > 0, "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat(get_stack_inputs(tensors, dim), dim); } Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { - AT_CHECK(tensors.size() > 0, + TORCH_CHECK(tensors.size() > 0, "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat_out(result, get_stack_inputs(tensors, dim), dim); @@ -571,7 +571,7 @@ Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { int64_t nsparse_dim = self.sparse_dim(); - AT_CHECK(dim0 < nsparse_dim && dim1 < nsparse_dim, + TORCH_CHECK(dim0 < nsparse_dim && dim1 < nsparse_dim, "sparse transpose: transposed dimensions must be sparse ", "Got sparse_dim: ", nsparse_dim, ", d0: ", dim0, ", d1: ", dim1); @@ -644,11 +644,11 @@ static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { int64_t sparse_dim = self.sparse_dim(); int64_t dense_dim = self.dense_dim(); - AT_CHECK(sparse_dim <= 2 && dense_dim == 0, + TORCH_CHECK(sparse_dim <= 2 && dense_dim == 0, fn, " expects a tensor with <= 2 sparse and 0 dense dimensions, but got ", sparse_dim, " sparse and ", dense_dim, " dense dimensions"); } else { - AT_CHECK(self.dim() <= 2, + TORCH_CHECK(self.dim() <= 2, fn, " expects a tensor with <= 2 dimensions, but self is ", self.dim(), "D"); } } @@ -790,7 +790,7 @@ Tensor & unsqueeze_(Tensor& self, int64_t dim) { Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { start_dim = maybe_wrap_dim(start_dim, self.dim()); end_dim = maybe_wrap_dim(end_dim, self.dim()); - AT_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); + TORCH_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); if (start_dim == end_dim) { return self; @@ -834,7 +834,7 @@ std::vector unbind(const Tensor &self, int64_t dim) { std::vector meshgrid(TensorList tensors) { int64_t size = tensors.size(); - AT_CHECK(size > 0, "meshgrid expects a non-empty TensorList"); + TORCH_CHECK(size > 0, "meshgrid expects a non-empty TensorList"); std::vector shape(size); for(int64_t i = 
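stack() above delegates to cat() after get_stack_inputs() unsqueezes every tensor at the stack dimension. A hedged sketch of that shape transformation using only public ATen calls; the function name stack_sketch is illustrative.

#include <ATen/ATen.h>
#include <vector>

// Stacking along `dim` is equivalent to inserting a new size-1 dimension at
// `dim` in every input and concatenating along that new dimension.
at::Tensor stack_sketch(at::TensorList tensors, int64_t dim) {
  std::vector<at::Tensor> unsqueezed;
  unsqueezed.reserve(tensors.size());
  for (const auto& t : tensors) {
    unsqueezed.push_back(t.unsqueeze(dim));
  }
  return at::cat(unsqueezed, dim);
}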
0; i < size; i++) { switch (tensors[i].dim()) { @@ -849,8 +849,8 @@ std::vector meshgrid(TensorList tensors) { } } for(int64_t i = 0; i < size - 1; i++){ - AT_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); - AT_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); + TORCH_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); + TORCH_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); } std::vector grids; for(int64_t i = 0; i < size; i++) { diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 86b7feb405b7..8bfd15ee63bb 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -107,21 +107,21 @@ Tensor roll_cpu(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { Tensor rot90(const Tensor& self, int64_t k, IntArrayRef dims) { const int64_t total_dims = self.dim(), total_rot_dims = dims.size(); - AT_CHECK(total_rot_dims == 2, + TORCH_CHECK(total_rot_dims == 2, "expected total rotation dims == 2, but got dims = ", total_rot_dims); - AT_CHECK(total_dims >= 2, + TORCH_CHECK(total_dims >= 2, "expected total dims >= 2, but got total dims = ", total_dims); - AT_CHECK(dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims, + TORCH_CHECK(dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims, "expected rotation dims to be different, but got dim0 = ", dims[0], " and dim1 = ", dims[1]); // check range of dims - AT_CHECK(dims[0] < total_dims && dims[0] >= -total_dims, + TORCH_CHECK(dims[0] < total_dims && dims[0] >= -total_dims, "Rotation dim0 out of range, dim0 = ", dims[0]); - AT_CHECK(dims[1] < total_dims && dims[1] >= -total_dims, + TORCH_CHECK(dims[1] < total_dims && dims[1] >= -total_dims, "Rotation dim1 out of range, dim1 = ", dims[1]); // handle modulo with negative k diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 7e54c46bfb7f..c8435c736bff 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -31,18 +31,18 @@ static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, // check duplicates in dims wrap_all_dims(flip_dims_v, total_dims); flip_dims_v.erase(std::unique(flip_dims_v.begin(), flip_dims_v.end()), flip_dims_v.end()); - AT_CHECK((int64_t)flip_dims_v.size() == flip_dims_size, + TORCH_CHECK((int64_t)flip_dims_v.size() == flip_dims_size, "dims has duplicates, original flip dims size=", flip_dims_size, ", but unique flip dims size=", flip_dims_v.size()); } static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { - AT_CHECK(shifts.size() > 0, "`shifts` required"); + TORCH_CHECK(shifts.size() > 0, "`shifts` required"); if (dims.size() == 0 && shifts.size() == 1) { auto flattened = self.contiguous().view(self.numel()); return roll(flattened, shifts[0], 0).view(self.sizes()); } - AT_CHECK( + TORCH_CHECK( shifts.size() == dims.size(), "shifts and dimensions must align. 
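roll_common() above falls back to a flattened 1-D roll when no dims are given, and otherwise requires shifts and dims to have matching lengths. A hedged caller-side sketch of both behaviours, assuming a small CPU float tensor.

#include <ATen/ATen.h>

int main() {
  auto t = at::arange(6, at::kFloat).reshape({2, 3});
  // No dims: the tensor is flattened, rolled by 2, then reshaped back to {2, 3}.
  auto rolled_flat = at::roll(t, /*shifts=*/{2});
  // Explicit dim: shifts and dims lengths must align (enforced by the TORCH_CHECK above).
  auto rolled_dim = at::roll(t, /*shifts=*/{1}, /*dims=*/{1});
  return 0;
}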
shifts: ", shifts.size(), ", dims:", dims.size() ); diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 9121a36041de..2f5d47ffccb5 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -85,31 +85,38 @@ Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) { return legacy::th::_th_clamp_min_out(result, self, min); } +Tensor& fill_out(Tensor& self, const Scalar value) { + auto iter = TensorIterator::nullary_op(self); + fill_stub(iter->device_type(), *iter, value); + return self; +} + Tensor& fill_(Tensor& self, Scalar value) { - return at::legacy::th::_th_fill_(self, value); + return fill_out(self, value); } Tensor& fill_(Tensor& self, const Tensor& value) { - return at::legacy::th::_th_fill_(self, value); + TORCH_CHECK(value.dim() == 0, "fill_ only supports 0-dimension value tensor but got tensor with ", value.dim(), " dimensions."); + return fill_out(self, value.item()); } Tensor mvlgamma(const Tensor& self, int64_t p) { - AT_CHECK(at::isFloatingType(self.scalar_type()), + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().item(), + TORCH_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); - AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); + TORCH_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. + 0.5, 0.5, 0.5, self.options()); args = args.add(self.unsqueeze(-1)); return args.lgamma_().sum(-1).add_(p * (p - 1) * std::log(M_PI) / 4.); } Tensor& mvlgamma_(Tensor& self, int64_t p) { - AT_CHECK(at::isFloatingType(self.scalar_type()), + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().item(), + TORCH_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); - AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); + TORCH_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. 
+ 0.5, 0.5, 0.5, self.options()); args = args.add(self.unsqueeze(-1)); return self.copy_(args.lgamma_().sum(-1).add_(p * (p - 1) * std::log(M_PI) / 4.)); @@ -136,7 +143,8 @@ Tensor& _sigmoid_out_cpu(Tensor& result, const Tensor& self) { #define IMPLEMENT_UNARY_OP_VEC(op) \ Tensor op(const Tensor& self) { \ Tensor result = at::empty({0}, self.options()); \ - return at::op##_out(result, self); \ + at::op##_out(result, self); \ + return result; \ } \ Tensor& _##op##__cpu(Tensor& self) { \ return at::op##_out(self, self); \ @@ -152,7 +160,8 @@ Tensor& _sigmoid_out_cpu(Tensor& result, const Tensor& self) { #define IMPLEMENT_UNARY_OP_TH(op) \ Tensor op(const Tensor& self) { \ Tensor result = at::empty({0}, self.options()); \ - return at::op##_out(result, self); \ + at::op##_out(result, self); \ + return result; \ } \ Tensor& _##op##__cpu(Tensor& self) { \ return at::op##_out(self, self); \ @@ -220,6 +229,6 @@ DEFINE_DISPATCH(sqrt_stub); DEFINE_DISPATCH(tan_stub); DEFINE_DISPATCH(tanh_stub); DEFINE_DISPATCH(trunc_stub); - +DEFINE_DISPATCH(fill_stub); } } // namespace at diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index b6758ca71ebd..a74d2b72763f 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -11,6 +11,8 @@ namespace at { namespace native { using unary_fn = void(*)(TensorIterator&); +DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar), fill_stub); + DECLARE_DISPATCH(unary_fn, abs_stub); DECLARE_DISPATCH(unary_fn, acos_stub); DECLARE_DISPATCH(unary_fn, asin_stub); @@ -46,7 +48,6 @@ DECLARE_DISPATCH(void(*)(Tensor&, const double, Generator *), bernoulli_mkl_stub // digamma // lgamma // erfinv -// fill // clone // contiguous // clamp/_min/_max diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 7291fda83778..058baa6086ed 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -1,33 +1,11 @@ #include #include +#include namespace at { namespace native { -// Corresponds to THNN_CHECK_DIM_SIZE -static inline void check_dim_size( - const Tensor& data, - int64_t dim, - int64_t dim_size, - int64_t size) { - /* Check dimension size of a tensor */ - AT_CHECK( - data.dim() == dim && data.size(dim_size) == size, - "Expected tensor of dimension ", - dim, - " and tensor.size[", - dim_size, - "] == ", - size, - " but got: dimension ", - data.dim(), - " and tensor.size[", - dim_size, - "] = ", - data.size(dim_size)); -} - static inline void upsample_1d_shape_check( const Tensor& input, const Tensor& grad_output, @@ -35,7 +13,7 @@ static inline void upsample_1d_shape_check( int64_t nchannels, int64_t input_width, int64_t output_width) { - AT_CHECK( + TORCH_CHECK( input_width > 0 && output_width > 0, "Input and output sizes should be greater than 0, but got input (W: ", input_width, @@ -44,7 +22,7 @@ static inline void upsample_1d_shape_check( ")"); if (input.defined()) { - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && input.dim() == 3, "Non-empty 3D data tensor expected but got a tensor with sizes ", input.sizes()); @@ -64,7 +42,7 @@ static inline void upsample_2d_shape_check( int64_t input_width, int64_t output_height, int64_t output_width) { - AT_CHECK( + TORCH_CHECK( input_height > 0 && input_width > 0 && output_height > 0 && output_width > 0, "Input and output sizes should be greater than 0," @@ -79,7 +57,7 @@ static inline void upsample_2d_shape_check( ")"); if (input.defined()) { - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && input.dim() == 4, "Non-empty 4D data tensor 
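For context on the DECLARE_DISPATCH line added above: the stub pattern splits a kernel across a declaration in the header, one DEFINE_DISPATCH in the operator's translation unit, and one REGISTER_DISPATCH per backend/CPU-capability kernel file, with the call site selecting the implementation by device type at runtime. The condensed map below only restates where each fill_stub piece lives in this diff; it adds nothing new.

// UnaryOps.h                      declare the stub and its signature:
//   DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar), fill_stub);
// UnaryOps.cpp                    define the dispatch table entry once:
//   DEFINE_DISPATCH(fill_stub);
// cpu/UnaryOpsKernel.cpp          register the CPU kernel:
//   REGISTER_DISPATCH(fill_stub, &fill_kernel);
// call site (fill_out)            dispatch by device type:
//   fill_stub(iter->device_type(), *iter, value);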
expected but got a tensor with sizes ", input.sizes()); @@ -102,7 +80,7 @@ static inline void upsample_3d_shape_check( int64_t output_depth, int64_t output_height, int64_t output_width) { - AT_CHECK( + TORCH_CHECK( input_depth > 0 && input_height > 0 && input_width > 0 && output_depth > 0 && output_height > 0 && output_width > 0, "Input and output sizes should be greater than 0, but got input (D: ", @@ -120,7 +98,7 @@ static inline void upsample_3d_shape_check( ")"); if (input.defined()) { - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && input.dim() == 5, "Non-empty 5D data tensor expected but got a tensor with sizes ", input.sizes()); diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 95e219879702..056520893676 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -166,7 +166,7 @@ static void upsample_bicubic2d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); @@ -217,12 +217,12 @@ static void upsample_bicubic2d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 4, "It is expected input_size equals to 4, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 6c91d688c791..3d273dca2ef7 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -159,7 +159,7 @@ static void upsample_bilinear2d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); @@ -214,12 +214,12 @@ static void upsample_bilinear2d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 4, "It is expected input_size equals to 4, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index 3f34aabcf583..07dd58b63f57 100644 --- a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -112,7 +112,7 @@ static void upsample_linear1d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); @@ -159,12 +159,12 @@ static void upsample_linear1d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 3, "It is expected input_size equals to 3, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 4d99943b727a..176aab3bad18 100644 --- 
a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -94,7 +94,7 @@ static void upsample_nearest1d_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); @@ -139,12 +139,12 @@ static void upsample_nearest1d_backward_out_cpu_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 1, "It is expected output_size equals to 1, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 3, "It is expected input_size equals to 3, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index eb9d5fc477aa..797c8e9d5123 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -119,7 +119,7 @@ static void upsample_nearest2d_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); @@ -170,12 +170,12 @@ static void upsample_nearest2d_backward_out_cpu_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 2, "It is expected output_size equals to 2, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 4, "It is expected input_size equals to 4, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index fd550fdf0bd8..37d763613426 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -154,7 +154,7 @@ static void upsample_nearest3d_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); @@ -213,12 +213,12 @@ static void upsample_nearest3d_backward_out_cpu_template( const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 5, "It is expected input_size equals to 5, but got size ", input_size.size()); diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 37f6082cd0fb..34096dfa09e3 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -222,7 +222,7 @@ static void upsample_trilinear3d_out_cpu_template( const Tensor& input_, IntArrayRef output_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); @@ -284,12 +284,12 @@ static void upsample_trilinear3d_backward_out_cpu_template( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - AT_CHECK( + TORCH_CHECK( output_size.size() == 3, "It is expected output_size equals to 3, but got size ", output_size.size()); - AT_CHECK( + TORCH_CHECK( input_size.size() == 5, "It is expected input_size equals to 5, but got size ", input_size.size()); diff --git 
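All of the upsample_*d templates touched above share the same precondition: output_size must have exactly one entry per spatial dimension, and the input_size used by the backward templates must have spatial dims + 2 entries (batch and channel). A hedged restatement of that invariant as a standalone check; the helper below is illustrative and not part of ATen.

#include <cstdint>
#include <stdexcept>

// A k-d upsample expects output_size.size() == k and input_size.size() == k + 2.
void check_upsample_sizes(int64_t spatial_dims,
                          int64_t output_size_len,
                          int64_t input_size_len) {
  if (output_size_len != spatial_dims) {
    throw std::runtime_error("output_size must have one entry per spatial dim");
  }
  if (input_size_len != spatial_dims + 2) {
    throw std::runtime_error("input_size must be {N, C, spatial...}");
  }
}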
a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp index caaeed77b7de..d1bc46809c53 100644 --- a/aten/src/ATen/native/WeightNorm.cpp +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -38,7 +38,7 @@ Tensor _weight_norm int64_t dim) { - AT_CHECK( + TORCH_CHECK( v_in.device() == g_in.device(), "weight_norm: expected v_in and g_in to be on the same device, but v_in is " "on ", v_in.device(), " and g_in is on ", g_in.device()); @@ -73,17 +73,17 @@ std::tuple _weight_norm_differentiable_backward // In Functions.cpp, the HardshrinkBackward object supplies "grad.contiguous()" // as the first argument, so grad_w should be contiguous here. // All these checks should succeed: - AT_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); - AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); - AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); - AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + TORCH_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); int64_t last_dim = saved_v.dim() - 1; int64_t last_size = saved_v.size(last_dim); // Like weight_norm_fused_backward, weight_norm_differentiable_backward should only ever be called // through a WeightNormFusedBackward object, so we expect that dim == 0 || dim == saved_v.size(-1) - AT_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); + TORCH_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); // saved_g and saved_norms are already shaped to broadcast over the correct dimensions diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 529075a1e20f..86331a31ecf6 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -1,68 +1,53 @@ -#include - #include -#include + #include -#include +#include #include #include -#include namespace at { namespace native { namespace { template -void copy_kernel_cast_t_impl(Tensor& self, const Tensor& src) { - auto builder = TensorIterator::Builder(); - builder.add_output(self); - builder.add_input(src); - builder.dont_resize_outputs(); - builder.dont_compute_common_dtype(); - auto iter = builder.build(); - +void copy_kernel_cast(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND2( - at::ScalarType::Half, - at::ScalarType::Bool, - src.scalar_type(), + ScalarType::Half, + ScalarType::Bool, + iter.dtype(1), "copy_kernel_cast", [&] { - at::native::unary_kernel(*iter, [=](scalar_t a) -> self_T { + at::native::unary_kernel(iter, [=](scalar_t a) -> self_T { return static_cast( static_cast>(a)); }); }); } -static void copy_kernel_cast_impl(Tensor& self, const Tensor& src) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, - self.scalar_type(), "copy_kernel_cast", [&]() { copy_kernel_cast_t_impl(self, src); }); -} - -static void copy_kernel_same_type_impl(Tensor& self, const Tensor& src) { - auto builder = TensorIterator::Builder(); - builder.add_output(self); - builder.add_input(src); - builder.dont_resize_outputs(); - auto iter = builder.build(); - - if (self.scalar_type() == at::ScalarType::Half) { - unary_kernel(*iter, [=](at::Half a) -> at::Half { return a; }); +static void copy_kernel(TensorIterator& iter, bool non_blocking) { + ScalarType dtype = 
iter.dtype(0); + if (dtype == iter.dtype(1)) { + if (dtype == ScalarType::Half) { + unary_kernel(iter, [=](at::Half a) -> at::Half { return a; }); + } else { + AT_DISPATCH_ALL_TYPES_AND( + ScalarType::Bool, dtype, "copy_kernel", [&] { + unary_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return a; }, + [=](Vec256 a) { return a; }); + }); + } } else { - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Bool, self.scalar_type(), "copy_kernel_same_type", [&] { - unary_kernel_vec( - *iter, - [=](scalar_t a) -> scalar_t { return a; }, - [=](Vec256 a) { return a; }); - }); + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::Bool, dtype, "copy_", [&] { + copy_kernel_cast(iter); + }); } } } // anonymous namespace -REGISTER_DISPATCH(copy_kernel_same_type, ©_kernel_same_type_impl); -REGISTER_DISPATCH(copy_kernel_cast, ©_kernel_cast_impl); +REGISTER_DISPATCH(copy_stub, ©_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cpu/CopyKernel.h b/aten/src/ATen/native/cpu/CopyKernel.h deleted file mode 100644 index 917c546bffbe..000000000000 --- a/aten/src/ATen/native/cpu/CopyKernel.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include -#include - -namespace at { -namespace native { - -using forward_fn = void (*)(Tensor&, const Tensor&); - -DECLARE_DISPATCH(forward_fn, copy_kernel_same_type); -DECLARE_DISPATCH(forward_fn, copy_kernel_cast); - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 320ad862e4be..26bb69e8e589 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -47,6 +47,11 @@ static inline bool is_unary_contiguous_s1(const int64_t* strides) { strides[1] == 0; } +template +static inline bool is_nullary_contiguous(const int64_t* strides) { + return strides[0] == sizeof(typename traits::result_type); +} + // result is static inline bool is_reduction(char** data, const int64_t* strides) { return strides[0] == 0 && @@ -93,6 +98,77 @@ static inline bool is_reduction(char** data, const int64_t* strides) { const char* in1_ptr = data[1]; \ const char* in2_ptr = data[2]; +#define NULLARY_LOOP_HEADER(func_t, data, strides) \ + using traits = nullary_function_traits; \ + using arg0_t = typename traits::result_type; \ + char* out_ptr = data[0]; \ + int64_t s0 = strides[0]; + + #define NULLARY_VEC_HEADER(func_t) \ + using traits = nullary_function_traits; \ + using scalar_t = typename traits::result_type; \ + using Vec = Vec256; + + #define NULLARY_VEC_LOOP_HEADER(func_t, data) \ + NULLARY_VEC_HEADER(func_t) \ + char* out_ptr = data[0]; + + +// Basic loop fill operation (zero inputs, one output). May be auto-vectorized +// by the compiler. 
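The rewritten copy_kernel above consolidates the old copy_kernel_same_type / copy_kernel_cast stubs into a single copy_stub: if the output and input dtypes of the iterator match, it runs a (vectorizable) identity kernel; otherwise it dispatches on the destination dtype and casts element-wise. A hedged, dependency-free sketch of that branching; the raw loops stand in for the real AT_DISPATCH_* macros and unary_kernel(_vec).

#include <cstdint>
#include <cstring>

// Cast path: destination and source dtypes differ, convert per element.
template <typename dst_t, typename src_t>
void copy_cast_sketch(dst_t* dst, const src_t* src, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    dst[i] = static_cast<dst_t>(src[i]);
  }
}

// Same-dtype path: a plain identity copy, which the real kernel vectorizes.
template <typename T>
void copy_same_sketch(T* dst, const T* src, int64_t n) {
  std::memcpy(dst, src, n * sizeof(T));
}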
+template +static inline void nullary_loop(char** data, const int64_t* strides, int64_t i, int64_t n, func_t op) { + NULLARY_LOOP_HEADER(func_t, data, strides) + for (; i < n; i++) { + arg0_t out = op(); + *(arg0_t*)(out_ptr + i * s0) = out; + } +} + + // computes out = op() +template +static inline void vectorized_nullary_loop(char** data, int64_t n, func_t op, vec_func_t vop) { + NULLARY_VEC_LOOP_HEADER(func_t, data) + int64_t i = 0; + for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { + auto out1 = vop(); + auto out2 = vop(); + out1.store(out_ptr + i * sizeof(scalar_t)); + out2.store(out_ptr + (i + Vec::size()) * sizeof(scalar_t)); + } + int64_t strides[] = { sizeof(scalar_t) }; + nullary_loop(data, strides, i, n, op); +} + +template +void nullary_kernel(TensorIterator& iter, func_t op) { + AT_ASSERT(iter.ntensors() > 0) + using traits = nullary_function_traits; + + iter.for_each([&](int ntensor, char** data, const int64_t* strides, int64_t n) { + // Specializations to encourage auto-vectorization (trick from Numpy's loops.c.src) + if (is_nullary_contiguous(strides)) { + nullary_loop(data, strides, 0, n, op); + } else { + nullary_loop(data, strides, 0, n, op); + } + }); +} + + template +void nullary_kernel_vec(TensorIterator& iter, func_t op, vec_func_t vop) { + AT_ASSERT(iter.ntensors() > 0) + using traits = nullary_function_traits; + + iter.for_each([&](int ntensor, char** data, const int64_t* strides, int64_t n) { + if (is_nullary_contiguous(strides)) { + vectorized_nullary_loop(data, n, op, vop); + } else { + nullary_loop(data, strides, 0, n, op); + } + }); +} + // Basic loop unary operation (one input, one output). May be auto-vectorized // by the compiler. template diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index dbc469a73342..b6adf4ae5717 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -25,6 +25,44 @@ static inline bool is_outer_reduction(const int64_t* strides) { strides[3] == sizeof(typename traits::arg2_t); } +template +static void set_result(const int index, const res_t result, const TensorIterator &iter, const int num_outputs) { + static_assert(std::is_same::value, "data types must match"); + if (index < num_outputs) { + char *out = (char *) iter.data_ptr(index); + *(res_t *) out = result; + } +} + +template +static void set_results(const res_t result, const TensorIterator &iter, const int num_outputs) { + AT_ASSERT(num_outputs == 1); + set_result(0, result, iter, num_outputs); +} + +template +static inline typename std::enable_if::type +for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { + return i; +} + +template +static inline typename std::enable_if::type +for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { + if (i < num_outputs) { + set_result(i, std::get(t), iter, num_outputs); + return for_each_in_tuple(t, iter, num_outputs); + } + return i; +} + +template +static void set_results(const std::tuple& result, const TensorIterator &iter, const int num_outputs) { + AT_ASSERT(num_outputs >= 1); + std::size_t result_size = for_each_in_tuple(result, iter, num_outputs); + AT_ASSERT(num_outputs == result_size); +} + template struct all_same : c10::guts::conjunction< std::is_same... 
@@ -64,7 +102,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { using c_traits = binary_function_traits; using p_traits = unary_function_traits; using acc_t = typename p_traits::arg1_t; - using data_t = typename p_traits::result_type; + using data_t = typename r_traits::arg2_t; static_assert( all_same< acc_t, @@ -75,19 +113,17 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { typename c_traits::arg2_t, typename c_traits::result_type>::value, "all accumulate types must match"); - static_assert( - std::is_same::value, - "all data types must match"); static_assert( std::is_default_constructible::value, "the accumulate type must be default-constructible" ); - iter.foreach_reduced_elt([&](TensorIterator &sub_iter) { - auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t { - sub_iter.serial_for_each([&acc, &ops](int ntensors, char** data, const int64_t* strides, int64_t size) { - AT_ASSERT(ntensors == 2); - char *in = data[1]; - int64_t stride = strides[1]; + const int num_outputs = iter.noutputs(); + iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIterator &sub_iter) { + auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t { + sub_iter.serial_for_each([&acc, &ops, num_outputs](int ntensors, char** data, const int64_t* strides, int64_t size) { + AT_ASSERT(ntensors - num_outputs == 1); + char *in = data[ntensors - 1]; + int64_t stride = strides[ntensors - 1]; for (int64_t i = 0; i < size; ++i) { acc = ops.reduce(acc, *(data_t*)in); in += stride; @@ -118,8 +154,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { total_acc = ops.combine(total_acc, buffer[i]); } } - char *out = (char *)sub_iter.data_ptr(0); - *(data_t*)out = ops.project(total_acc); + set_results(ops.project(total_acc), sub_iter, num_outputs); }); } diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index b895636b2ffd..95a1b9c1bfb2 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -38,7 +38,7 @@ static void std_var_kernel_impl(TensorIterator &iter, bool unbiased, bool take_s AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "std_cpu", [&] { binary_kernel_reduce( iter, - WelfordOps { unbiased, take_sqrt }, + WelfordOps> { unbiased, take_sqrt }, WelfordData() ); }); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 9ede5385367f..3f9aecac5474 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -54,6 +54,25 @@ static void abs_kernel(TensorIterator& iter) { }); } +static void fill_kernel(TensorIterator& iter, Scalar value_scalar) { + if( iter.dtype() == ScalarType::Half ) { + auto value = value_scalar.to().x; + using H = decltype(value); + nullary_kernel_vec( + iter, + [=]() -> H { return value; }, + [=]() { return Vec256(value); }); + } else { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, iter.dtype(), "fill_cpu", [&]() { + scalar_t value = value_scalar.to(); + nullary_kernel_vec( + iter, + [=]() -> scalar_t { return value; }, + [=]() { return Vec256(value); }); + }); + } +} + static void frac_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "frac_cpu", [&]() { unary_kernel_vec( @@ -192,6 +211,7 @@ REGISTER_DISPATCH(abs_stub, &abs_kernel); REGISTER_DISPATCH(frac_stub, &frac_kernel); REGISTER_DISPATCH(reciprocal_stub, 
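The Reduce.h changes above generalize binary_kernel_reduce to multi-output reductions: ops.project() may now return a std::tuple, and set_results() walks that tuple writing element i to output i of the sub-iterator, asserting the arity against noutputs(). The WelfordOps change in ReduceOpsKernel.cpp is the motivating user, since one Welford pass can then publish more than a single value. Below is a hedged, plain-C++ sketch of a one-pass Welford accumulator whose project() yields a (std-or-var, mean) tuple; the struct is illustrative and is not the WelfordOps implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <tuple>

struct WelfordSketch {
  double mean = 0.0, m2 = 0.0;
  int64_t n = 0;

  // Fold one element into the running mean and M2 (sum of squared deviations).
  void reduce(double x) {
    ++n;
    double delta = x - mean;
    mean += delta / n;
    m2 += delta * (x - mean);
  }

  // Multi-output projection, analogous to feeding a tuple into set_results().
  std::tuple<double, double> project(bool unbiased, bool take_sqrt) const {
    double var = m2 / std::max<int64_t>(unbiased ? n - 1 : n, 1);
    return std::make_tuple(take_sqrt ? std::sqrt(var) : var, mean);
  }
};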
&reciprocal_kernel); REGISTER_DISPATCH(neg_stub, &neg_kernel); +REGISTER_DISPATCH(fill_stub, &fill_kernel); // IMPLEMENT_FLOAT_KERNEL(ALL, abs) IMPLEMENT_FLOAT_KERNEL(FLOATING, acos) diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp new file mode 100644 index 000000000000..638d010371d9 --- /dev/null +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -0,0 +1,79 @@ +#include + +#include +#include +#include + +namespace at { +namespace native { + +namespace { + +template +void LayerNormKernelImplInternal( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + T eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + DCHECK_EQ(X.numel(), M * N); + DCHECK(!gamma.defined() || gamma.numel() == N); + DCHECK(!beta.defined() || beta.numel() == N); + const T* X_data = X.data(); + const T* gamma_data = gamma.defined() ? gamma.data() : nullptr; + const T* beta_data = beta.defined() ? beta.data() : nullptr; + T* Y_data = Y->data(); + T* mean_data = mean->data(); + T* rstd_data = rstd->data(); + const T c = T(1) / static_cast(N); + const bool gamma_null = gamma_data == nullptr; + const bool beta_null = beta_data == nullptr; + for (int64_t i = 0; i < M; ++i) { + const T* X_ptr = X_data + i * N; + T* Y_ptr = Y_data + i * N; + T mean_val = T(0); + T rstd_val = T(0); + for (int64_t j = 0; j < N; ++j) { + mean_val += X_ptr[j]; + rstd_val += X_ptr[j] * X_ptr[j]; + } + mean_val *= c; + rstd_val = T(1) / std::sqrt(rstd_val * c - mean_val * mean_val + eps); + const T scale = rstd_val; + const T bias = -rstd_val * mean_val; + for (int64_t j = 0; j < N; ++j) { + const T gamma_v = gamma_null ? T(1) : gamma_data[j]; + const T beta_v = beta_null ? T(0) : beta_data[j]; + Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; + } + mean_data[i] = mean_val; + rstd_data[i] = rstd_val; + } +} + +void LayerNormKernelImpl( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + double eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "LayerNormKernelImpl", [&]() { + LayerNormKernelImplInternal( + X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + }); +} + +} // namespace + +REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.h b/aten/src/ATen/native/cpu/layer_norm_kernel.h new file mode 100644 index 000000000000..ae39aa76e5e8 --- /dev/null +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.h @@ -0,0 +1,26 @@ +#ifndef ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_ +#define ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_ + +#include +#include + +namespace at { +namespace native { + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */, + Tensor* /* mean */, + Tensor* /* rstd */); + +DECLARE_DISPATCH(forward_fn, LayerNormKernel); + +} // namespace native +} // namespace at + +#endif // ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_ diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index e0c2148f35e0..8c95313c4a09 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -47,14 +47,14 @@ __global__ void prelu_cuda_kernel_multi_weights( } Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { - AT_CHECK(self.is_cuda()); - 
AT_CHECK(weight_.is_cuda()); + TORCH_CHECK(self.is_cuda()); + TORCH_CHECK(weight_.is_cuda()); auto input = self.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(weight.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int64_t weight_num = weight.numel(); Tensor result = at::empty_like(input); @@ -71,7 +71,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { } else { // case2: multiple weights, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_stride0 = 1, input_stride1 = 1; @@ -81,7 +81,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); @@ -92,7 +92,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - AT_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu: input too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu: input too large or too many dimensions"); AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_cuda", [&] { prelu_cuda_kernel_multi_weights @@ -155,17 +155,17 @@ __global__ void prelu_cuda_backward_kernel_multi_weights( } std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Tensor& self, const Tensor& weight_) { - AT_CHECK(grad_out_.is_cuda()); - AT_CHECK(self.is_cuda()); - AT_CHECK(weight_.is_cuda()); + TORCH_CHECK(grad_out_.is_cuda()); + TORCH_CHECK(self.is_cuda()); + TORCH_CHECK(weight_.is_cuda()); auto input = self.contiguous(); auto grad_out = grad_out_.contiguous(); auto weight = weight_.contiguous(); - AT_CHECK(input.is_contiguous()); - AT_CHECK(weight.is_contiguous()); - AT_CHECK(grad_out.is_contiguous()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); + TORCH_CHECK(grad_out.is_contiguous()); int64_t weight_num = weight.numel(); auto strides = input.strides(); @@ -187,7 +187,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te } else { // case2: multiple parameters, one for each channel int64_t input_ndim = input.dim(); - AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); + TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 int64_t input_stride0 = 1, input_stride1 = 1; @@ -197,7 +197,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te input_stride0 = strides[0]; input_stride1 = strides[1]; } - AT_CHECK(channel_size == weight_num, + TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. 
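prelu_cuda above has two code paths: a single shared weight (weight_num == 1) and one weight per channel, where the channel index is recovered from the flattened element index via the dim-0/dim-1 strides. The elementwise rule is simply out = x if x >= 0, else w_c * x. A hedged CPU sketch of the per-channel case; the contiguous-NCHW layout assumption is mine, not taken from the kernel.

#include <cstdint>

// Per-channel PReLU over a contiguous NCHW buffer:
// channel = (i / inner) % C, where inner = H * W.
void prelu_nchw_sketch(const float* in, const float* weight, float* out,
                       int64_t numel, int64_t channels, int64_t inner) {
  for (int64_t i = 0; i < numel; ++i) {
    int64_t c = (i / inner) % channels;
    out[i] = in[i] >= 0.f ? in[i] : weight[c] * in[i];
  }
}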
Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); @@ -208,7 +208,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - AT_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu_backward_cuda: input too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(input_numel, grid, curDevice), "prelu_backward_cuda: input too large or too many dimensions"); AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "prelu_backward_cuda", [&] { prelu_cuda_backward_kernel_multi_weights diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 7211aa3e895b..7f963f1c7a2e 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -214,13 +214,13 @@ namespace { checkAllSameGPU("cudnn_adaptive_avg_pooling2d", {input_arg, output_arg}); for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); Tensor input_ = input; int64_t grid_x = input.size(-3); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu new file mode 100644 index 000000000000..a088dcc3f95d --- /dev/null +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -0,0 +1,517 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include // for atomicAdd +#include + +#include +#include +#include + +namespace at { +namespace native { + +namespace { + +__device__ inline int start_index(int a, int b, int c) { + return (int)std::floor((float)(a * c) / b); +} + +__device__ inline int end_index(int a, int b, int c) { + return (int)std::ceil((float)((a + 1) * c) / b); +} + +// 5d tensor B x D x T x H x W +// All kernels view batch dim B and dim D as collapsed. + +/* + * Description: + * this function adaptively average pools an input 5D tensor along dimensions + * 2, 3, and 4 5D input, 5D output + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + */ +template +__global__ void adaptiveaveragepool( + scalar_t *input, scalar_t *output, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t istrideD, + int64_t istrideT, int64_t istrideH, int64_t istrideW, + int64_t offsetZ) { + // iterates on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // slice/feature + + // input frame/time range is fixed. 
+ int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // input offset by slice/feature and earliest relevant frame/time + scalar_t *input_dt = input + d*istrideD + istartT*istrideT; + // output offset by slice/feature and frame/time + scalar_t *output_dt = output + o_plane*osizeH*osizeW; + + // For all output pixels... + for (oh = ostartH; oh < oendH; oh += ostepH) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = ostartW; ow < oendW; ow += ostepW) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the average pooling from corresponding input pixels + scalar_t *ptr_input = input_dt + istartH*istrideH + istartW*istrideW; + scalar_t *ptr_output = output_dt + oh*osizeW + ow; + scalar_t sum = ScalarConvert::to(0); + + int it, ih, iw; + for (it = 0; it < kT; ++it) { + for (ih = 0; ih < kH; ++ih) { + for (iw = 0; iw < kW; ++iw) { + scalar_t val = ptr_input[ih*istrideH + iw*istrideW]; + sum += val; + } + } + ptr_input += istrideT; // next input frame + } + // Update output + *ptr_output = sum / kT / kH / kW; + } + } +} + +template +void adaptiveaveragepool_loop( + scalar_t *input_data, scalar_t *output_data, + int64_t totalZ, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t istrideD, int64_t istrideT, int64_t istrideH, int64_t istrideW) { + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = std::max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); + adaptiveaveragepool<<>>( + input_data, output_data, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, + istrideT, istrideH, istrideW, + offsetZ); + + totalZ -= 65535; + offsetZ += 65535; + AT_CUDA_CHECK(cudaGetLastError()); + } +} + +/* + * Description: + * This function computes the gradInput from gradOutput. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + */ +template +__global__ void adaptiveaveragegradinput( + scalar_t *gradInput, scalar_t *gradOutput, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ) +{ + // iterators on input pixels + int it, ih, iw; + + // compute offsets based on thread/block ID + int istartH = blockIdx.y * blockDim.y + threadIdx.y; + int iendH = isizeH; + int istepH = gridDim.y * blockDim.y; + int istartW = threadIdx.x; + int iendW = isizeW; + int istepW = blockDim.x; + + // select input plane + int64_t i_plane = blockIdx.x + offsetZ; + it = i_plane % isizeT; // output frame/time + int d = i_plane / isizeT; // slice/feature + + // output frame/time range is fixed. + int ostartT = start_index(it, isizeT, osizeT); + int oendT = end_index(it, isizeT, osizeT); + + // gradInput offset by slice/feature and frame/time. + scalar_t *gradInput_dt = gradInput + i_plane*isizeH*isizeW; + // gradOutput offset by slice/feature and earliest relevant frame/time + scalar_t *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW; + + // For all input pixels... 
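The adaptive pooling kernels in this new file all rely on the same index mapping: output cell o over an axis with input length isize and output length osize covers input indices [floor(o*isize/osize), ceil((o+1)*isize/osize)), and the forward pass averages over the resulting T x H x W window. A hedged 1-D CPU sketch of that mapping and the averaging; the function names are illustrative.

#include <cmath>
#include <cstdint>

// Same mapping as the device start_index/end_index helpers above.
static inline int64_t start_idx(int64_t o, int64_t osize, int64_t isize) {
  return (int64_t)std::floor((double)(o * isize) / osize);
}
static inline int64_t end_idx(int64_t o, int64_t osize, int64_t isize) {
  return (int64_t)std::ceil((double)((o + 1) * isize) / osize);
}

// Illustrative 1-D adaptive average pool.
void adaptive_avg_pool1d_sketch(const float* in, float* out,
                                int64_t isize, int64_t osize) {
  for (int64_t o = 0; o < osize; ++o) {
    int64_t i0 = start_idx(o, osize, isize);
    int64_t i1 = end_idx(o, osize, isize);
    float sum = 0.f;
    for (int64_t i = i0; i < i1; ++i) {
      sum += in[i];
    }
    out[o] = sum / (i1 - i0);
  }
}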
+ for (ih = istartH; ih < iendH; ih += istepH) { + int ostartH = start_index(ih, isizeH, osizeH); + int oendH = end_index(ih, isizeH, osizeH); + + for (iw = istartW; iw < iendW; iw += istepW) { + int ostartW = start_index(iw, isizeW, osizeW); + int oendW = end_index(iw, isizeW, osizeW); + + // Compute the gradients from corresponding output pixels + scalar_t *ptr_gradInput = gradInput_dt + ih*isizeW + iw; + scalar_t *ptr_gradOutput = gradOutput_dt; + + // for all relevant output pixels + int ot, oh, ow; + for (ot = ostartT; ot < oendT; ++ot) { + int kT = end_index(ot, osizeT, isizeT) - start_index(ot, osizeT, isizeT); + for (oh = ostartH; oh < oendH; ++oh) { + int kH = end_index(oh, osizeH, isizeH) - start_index(oh, osizeH, isizeH); + for (ow = ostartW; ow < oendW; ++ow) { + int kW = end_index(ow, osizeW, isizeW) - start_index(ow, osizeW, isizeW); + scalar_t grad_delta = ptr_gradOutput[oh*isizeW + ow] / kW / kH / kT; + *ptr_gradInput += grad_delta; + } + } + ptr_gradOutput += osizeH*osizeW; // next output frame + } + } + } +} + +template +void adaptiveaveragegradinput_loop( + scalar_t *gradInput_data, scalar_t *gradOutput_data, + int64_t totalZ, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW) { + int64_t offsetZ = 0; + dim3 threads(32, 8); + // each H*W plane is processed by blocksH thread blocks + int blocksH = std::max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); + adaptiveaveragegradinput<<>>( + gradInput_data, gradOutput_data, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + offsetZ); + + totalZ -= 65535; + offsetZ += 65535; + AT_CUDA_CHECK(cudaGetLastError()); + } +} + +/* + * Description: + * This function computes the gradInput from gradOutput. + * + * gridDim.y blocks work together on a single 2D output plane specified by + * (blockIdx.x + offsetZ). + * + * (uses atomic add) + * + */ +template +__global__ void atomicadaptiveaveragegradinput( + scalar_t *gradInput, scalar_t *gradOutput, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW, + int64_t offsetZ) +{ + // iterators on output pixels + int ot, oh, ow; + + // compute offsets based on thread/block ID + int ostartH = blockIdx.y * blockDim.y + threadIdx.y; + int oendH = osizeH; + int ostepH = gridDim.y * blockDim.y; + int ostartW = threadIdx.x; + int oendW = osizeW; + int ostepW = blockDim.x; + + // select output plane + int64_t o_plane = blockIdx.x + offsetZ; + ot = o_plane % osizeT; // output frame/time + int d = o_plane / osizeT; // output slice/feature + + // input frame/time range is fixed. + int istartT = start_index(ot, osizeT, isizeT); + int iendT = end_index(ot, osizeT, isizeT); + int kT = iendT - istartT; + + // gradInput offset by slice/feature and earliest relevant frame/time + scalar_t *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW; + // gradOutput offset by slice/feature and frame/time + scalar_t *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW; + + // For all output pixels... 
+ for (oh = ostartH; oh < oendH; oh += ostepH) { + int istartH = start_index(oh, osizeH, isizeH); + int iendH = end_index(oh, osizeH, isizeH); + int kH = iendH - istartH; + + for (ow = ostartW; ow < oendW; ow += ostepW) { + int istartW = start_index(ow, osizeW, isizeW); + int iendW = end_index(ow, osizeW, isizeW); + int kW = iendW - istartW; + + // Compute the gradients from corresponding input pixels + scalar_t *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW; + scalar_t *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow; + scalar_t grad_delta = *ptr_gradOutput / kT / kH / kW; + + int it, ih, iw; + for (it = 0; it < kT; ++it) { + for (ih = 0; ih < kH; ++ih) { + for (iw = 0; iw < kW; ++iw) { + atomicAdd(&(ptr_gradInput[ih*isizeW + iw]), grad_delta); + } + } + ptr_gradInput += isizeH*isizeW; // next input frame + } + } + } +} + +template +void atomicadaptiveaveragegradinput_loop( + scalar_t* gradInput_data, scalar_t* gradOutput_data, + int64_t totalZ, + int isizeT, int isizeH, int isizeW, + int osizeT, int osizeH, int osizeW) { + int64_t offsetZ = 0; + dim3 threads(32, 8); + int blocksH = std::max((int)(16L / totalZ), 1); + while (totalZ > 0) { + dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); + atomicadaptiveaveragegradinput<<>>( + gradInput_data, gradOutput_data, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + offsetZ); + + totalZ -= 65535; + offsetZ += 65535; + AT_CUDA_CHECK(cudaGetLastError()); + } +} + +// 5D tensor B x D x T x H x w + +void adaptive_avg_pool3d_out_cuda_template( + Tensor& output, + const Tensor& input_, + IntArrayRef& output_size) { + TensorArg output_arg{output, "output", 1}; + TensorArg input_arg{input_, "input_", 2}; + + checkAllSameGPU("adaptive_avg_pool3d_cuda", {output_arg, input_arg}); + + for (int64_t i = 0; i < input_.ndimension(); i++) { + TORCH_CHECK( + input_.size(i) > 0, + "adaptive_avg_pool3d_cuda(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", input_.sizes(), + " with dimension ", i, " being empty"); + } + + TORCH_CHECK( + (input_.ndimension() == 4 || input_.ndimension() == 5), + "non-empty 4D or 5D (batch mode) tensor expected for input"); + + // the jit sometimes passes output_size.size() == 1 + TORCH_CHECK( + output_size.size() == 1 || output_size.size() == 3, + "adaptive_avg_pool3d: internal error: output_size.size() must be 1 or 3"); + + int64_t osizeT = output_size[0]; + int64_t osizeH = output_size[1]; + int64_t osizeW = output_size[2]; + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t istrideD, istrideT, istrideH, istrideW; + int64_t totalZ; + + const Tensor& input = input_.ndimension() == 4 ? 
input_ : input_.contiguous(); + + if (input.ndimension() == 4) { + sizeD = input.size(0); + isizeT = input.size(1); + isizeH = input.size(2); + isizeW = input.size(3); + + istrideD = input.stride(0); + istrideT = input.stride(1); + istrideH = input.stride(2); + istrideW = input.stride(3); + + output.resize_({sizeD, osizeT, osizeH, osizeW}); + + totalZ = sizeD * osizeT; + } else { + int64_t sizeB = input.size(0); + sizeD = input.size(1); + isizeT = input.size(2); + isizeH = input.size(3); + isizeW = input.size(4); + + istrideD = input.stride(1); + istrideT = input.stride(2); + istrideH = input.stride(3); + istrideW = input.stride(4); + + output.resize_({sizeB, sizeD, osizeT, osizeH, osizeW}); + + totalZ = sizeB * sizeD * osizeT; + } + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_cuda", [&] { + scalar_t* input_data = input.data(); + scalar_t* output_data = output.data(); + + adaptiveaveragepool_loop( + input_data, output_data, + totalZ, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW, + istrideD, istrideT, istrideH, istrideW); + }); +} + +void adaptive_avg_pool3d_backward_out_cuda_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + TensorArg grad_input_arg{gradInput, "gradInput", 1}; + TensorArg grad_output_arg{gradOutput_, "gradOutput_", 2}; + TensorArg input_arg{input, "input", 3}; + + checkAllSameGPU( + "adaptive_avg_pool3d_out_cuda", + {grad_input_arg, grad_output_arg, input_arg}); + + const Tensor gradOutput = gradOutput_.contiguous(); + + gradInput.resize_as_(input); + gradInput.zero_(); + + int64_t sizeD, isizeT, isizeH, isizeW; + int64_t osizeT, osizeH, osizeW; + int64_t totalZ; + + if (input.ndimension() == 4) { + sizeD = input.size(0); + isizeT = input.size(1); + isizeH = input.size(2); + isizeW = input.size(3); + + osizeT = gradOutput.size(1); + osizeH = gradOutput.size(2); + osizeW = gradOutput.size(3); + } else { + sizeD = input.size(1); + isizeT = input.size(2); + isizeH = input.size(3); + isizeW = input.size(4); + + osizeT = gradOutput.size(2); + osizeH = gradOutput.size(3); + osizeW = gradOutput.size(4); + } + + bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); + + if (input.ndimension() == 4) { + totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; + } else { + int sizeB = input.size(0); + totalZ = atomic ? 
sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; + } + + if (atomic) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cuda", [&] { + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + + atomicadaptiveaveragegradinput_loop( + gradInput_data, gradOutput_data, + totalZ, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "adaptive_avg_pool3d_backward_cuda", [&] { + scalar_t* gradInput_data = gradInput.data(); + scalar_t* gradOutput_data = gradOutput.data(); + + adaptiveaveragegradinput_loop( + gradInput_data, gradOutput_data, + totalZ, + isizeT, isizeH, isizeW, + osizeT, osizeH, osizeW); + }); + } +} + +} // namespace + +Tensor& adaptive_avg_pool3d_out_cuda( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + adaptive_avg_pool3d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor adaptive_avg_pool3d_cuda( + const Tensor& input, + IntArrayRef output_size) { + auto output = at::empty({0}, input.options()); + adaptive_avg_pool3d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor& adaptive_avg_pool3d_backward_out_cuda( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input) { + adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input); + return gradInput; +} + +Tensor adaptive_avg_pool3d_backward_cuda( + const Tensor& gradOutput_, + const Tensor& input) { + auto gradInput = at::zeros_like(input); + adaptive_avg_pool3d_backward_out_cuda_template(gradInput, gradOutput_, input); + return gradInput; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu index b5b47dbf6e2d..5c9ec4f8ceb1 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu @@ -204,16 +204,16 @@ void adaptive_max_pool2d_out_cuda_template( checkAllSameGPU("adaptive_max_pool2d_cuda", {output_arg, indices_arg, input_arg}); for (int64_t i = 0; i < input.ndimension(); i++) { - AT_CHECK(input.size(i) > 0, + TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool2d_cuda(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 2, + TORCH_CHECK(output_size.size() == 2, "adaptive_max_pool2d: internal error: output_size.size() must be 2"); int64_t osizeH = output_size[0]; diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu index 9ee9d70280e8..21c57c0ce382 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu @@ -307,16 +307,16 @@ void adaptive_max_pool3d_out_cuda_template( checkAllSameGPU("adaptive_max_pool3d_cuda", {output_arg, indices_arg, input_arg}); for (int64_t i = 0; i < input_.ndimension(); i++) { - AT_CHECK(input_.size(i) > 0, + TORCH_CHECK(input_.size(i) > 0, "adaptive_max_pool3d_cuda(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input_.sizes(), " with dimension ", i, " being " "empty"); } - AT_CHECK((input_.ndimension() == 4 || 
input_.ndimension() == 5), + TORCH_CHECK((input_.ndimension() == 4 || input_.ndimension() == 5), "non-empty 4D or 5D (batch mode) tensor expected for input"); - AT_CHECK(output_size.size() == 3, + TORCH_CHECK(output_size.size() == 3, "adaptive_max_pool3d: internal error: output_size.size() must be 3"); int64_t osizeT = output_size[0]; diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index d91cfdab329a..0411850316f2 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -63,6 +63,18 @@ void magmaLuNoPivBatched( AT_ERROR("lu only takes float or double Tensors"); } +template +inline magma_int_t magmaGetriOptimalBlocksize(magma_int_t n) { + AT_ERROR("getri only takes float or double Tensors"); +} + +template +void magmaGetri( + magma_int_t n, scalar_t* dA, magma_int_t ldda, magma_int_t* ipiv, scalar_t* dwork, + magma_int_t lwork, magma_int_t* info) { + AT_ERROR("getri only takes float or double Tensors"); +} + template void magmaGetriBatched( magma_int_t n, scalar_t** dA_array, magma_int_t ldda, @@ -202,6 +214,30 @@ void magmaLuNoPivBatched( magma_sgetrf_nopiv_batched(m, n, dA_array, ldda, info_array, batchsize, magma_queue.get_queue()); } +template<> +inline magma_int_t magmaGetriOptimalBlocksize(magma_int_t n) { + return magma_get_dgetri_nb(n); +} + +template<> +inline magma_int_t magmaGetriOptimalBlocksize(magma_int_t n) { + return magma_get_sgetri_nb(n); +} + +template<> +void magmaGetri( + magma_int_t n, double* dA, magma_int_t ldda, magma_int_t* ipiv, double* dwork, + magma_int_t lwork, magma_int_t* info) { + magma_dgetri_gpu(n, dA, ldda, ipiv, dwork, lwork, info); +} + +template<> +void magmaGetri( + magma_int_t n, float* dA, magma_int_t ldda, magma_int_t* ipiv, float* dwork, + magma_int_t lwork, magma_int_t* info) { + magma_sgetri_gpu(n, dA, ldda, ipiv, dwork, lwork, info); +} + template<> void magmaGetriBatched( magma_int_t n, double** dA_array, magma_int_t ldda, @@ -382,7 +418,7 @@ std::tuple _solve_helper_cuda(const Tensor& self, const Tensor& // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template -static void apply_inverse(Tensor& self, Tensor& self_inv, std::vector& infos) { +static void apply_batched_inverse(Tensor& self, Tensor& self_inv, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("inverse: MAGMA library not found in " "compilation. Please rebuild with MAGMA."); @@ -429,17 +465,47 @@ AT_ERROR("inverse: MAGMA library not found in " #endif } -// Because this is out-of-place inverse, the predefined macros will -// not work +template +static void apply_single_inverse(Tensor& self, int64_t& info) { +#ifndef USE_MAGMA +AT_ERROR("inverse: MAGMA library not found in " + "compilation. 
Please rebuild with MAGMA."); +#else + auto self_data = self.data(); + magma_int_t n = magma_int_cast(self.size(-2), "self.size(-2)"); + magma_int_t lwork = n * magmaGetriOptimalBlocksize(n); + magma_int_t info_tmp = 0; + + Tensor ipiv = at::empty({n}, at::kInt); + Tensor dwork = at::empty({lwork}, self.options()); + magmaLu(n, n, self_data, n, ipiv.data(), &info_tmp); + if (info_tmp != 0) { + info = info_tmp; + return; + } + magmaGetri( + n, self_data, n, ipiv.data(), dwork.data(), lwork, &info_tmp); + info = info_tmp; +#endif +} + Tensor _inverse_helper_cuda(const Tensor& self) { - std::vector infos(batchCount(self), 0); - auto self_working_copy = cloneBatchedColumnMajor(self); auto self_inv_working_copy = cloneBatchedColumnMajor(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{ - apply_inverse( - self_working_copy, self_inv_working_copy, infos); - }); - batchCheckErrors(infos, "inverse_cuda"); + if (self.dim() > 2) { + std::vector infos(batchCount(self), 0); + auto self_working_copy = cloneBatchedColumnMajor(self); + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{ + apply_batched_inverse( + self_working_copy, self_inv_working_copy, infos); + }); + batchCheckErrors(infos, "inverse_cuda"); + } else { + int64_t info = 0; + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "inverse_cuda", [&]{ + apply_single_inverse(self_inv_working_copy, info); + }); + singleCheckErrors(info, "inverse_cuda"); + } return self_inv_working_copy; } @@ -497,7 +563,7 @@ Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upp AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_solve_cuda", [&]{ apply_cholesky_solve(self_working_copy, A_working_copy, upper, info); }); - AT_CHECK(info == 0, "MAGMA cholesky_solve : invalid argument: ", -info); + TORCH_CHECK(info == 0, "MAGMA cholesky_solve : invalid argument: ", -info); return self_working_copy; } @@ -633,7 +699,7 @@ AT_ERROR("lu: MAGMA library not found in " } std::tuple _lu_with_info_cuda(const Tensor& self, bool pivot, bool check_errors) { - AT_CHECK(self.dim() >= 2, + TORCH_CHECK(self.dim() >= 2, "expected tensor with 2 or more dimensions, got size: ", self.sizes(), " instead"); squareCheckInputs(self); diff --git a/aten/src/ATen/native/cuda/BinaryOpsKernel.cu b/aten/src/ATen/native/cuda/BinaryOpsKernel.cu index 2b8e33837348..acfb268e7ada 100644 --- a/aten/src/ATen/native/cuda/BinaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryOpsKernel.cu @@ -7,8 +7,7 @@ #include -// NOTE: CUDA 8 does not allow __device__ lambdas (GPU_LAMBDA) to be defined -// inside other lambdas. CUDA on Windows requires that the enclosing function +// NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. 
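A minimal, self-contained CUDA sketch of the restriction the note above describes: the function that defines an extended __device__ lambda must have external linkage (not static, not in an anonymous namespace) so that nvcc on Windows can instantiate the kernel from it. Everything below is illustrative only, assumes compilation with nvcc's extended-lambda support, and is not part of this diff:

  #include <cuda_runtime.h>

  template <typename func_t>
  __global__ void apply_elementwise(float* out, const float* in, int n, func_t f) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
      out[i] = f(in[i]);
    }
  }

  // External linkage: nvcc can instantiate apply_elementwise from the lambda
  // defined here. Marking this function static, or moving it into an anonymous
  // namespace, would give it internal linkage and trigger the Windows build
  // problem the note above warns about. (Illustrative sketch, not patch code.)
  void scale_by_two(float* out, const float* in, int n, cudaStream_t stream) {
    auto f = [] __device__ (float x) { return 2.0f * x; };
    apply_elementwise<<<(n + 255) / 256, 256, 0, stream>>>(out, in, n, f);
  }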
namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 9ddab337401e..318454dbf09b 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -1,75 +1,40 @@ #include #include #include -#include -#include #include #include #include #include +#include +#include +#include -namespace { +namespace at { +namespace native { -using namespace at; using namespace at::cuda; -// Copy operator for the pointwise apply kernel -template -struct CopyOp { - static void apply(Tensor& dst, const Tensor& src) { - CUDA_tensor_apply2( - dst, src, [] __device__(dst_T & dst_val, const src_T& src_val) { -#if __CUDA_ARCH__ >= 350 - dst_val = static_cast( - static_cast>(__ldg(&src_val))); -#else - dst_val = static_cast(static_cast>(src_val)); -#endif - }); - } -}; - -template -struct CopyOp { - static void apply(Tensor& dst, const Tensor& src) { - CUDA_tensor_apply2( - dst, src, [] __device__(dst_T & dst_val, const bool& src_val) { - dst_val = static_cast(static_cast>(src_val)); - }); - } -}; +template +void copy_kernel_impl(TensorIterator& iter) { + gpu_unary_kernel(iter, []GPU_LAMBDA(src_t x) -> dst_t { + return static_cast(static_cast>(x)); + }); +} // device-to-device copy, does type conversion -template -void copy_device_to_device(Tensor& dst, const Tensor& src) { - auto numel = dst.numel(); - if (dst.is_same(src) || numel == 0) { - return; - } +static void copy_device_to_device(TensorIterator& iter, bool non_blocking) { + int64_t numel = iter.numel(); - // We can memcpy the memory if: - // -both tensors are contiguous; or, - // -there is only one element to copy; or, - // -FIXME: if both tensors have matching size and stride arrays, and no - // holes within (in other words, there is some permutation that can be applied - // to the size/strides such that the resulting tensor is - // contiguous). - // -AND: both tensors have the same type. - bool same_type = std::is_same::value; - bool memcpy_eligible = - ((src.is_contiguous() && dst.is_contiguous()) || (numel == 1)) && - same_type; + // We can memcpy the memory if both tensors have the same type AND both + // tensors are contiguous after dimension coalescing and reordering. + bool same_type = iter.dtype(0) == iter.dtype(1); + bool memcpy_eligible = same_type && iter.is_contiguous(); - Device src_device = src.device(); - Device dst_device = dst.device(); + Device dst_device = iter.device(0); + Device src_device = iter.device(1); CUDAGuard device_guard(src_device); - // Try to enable p2p access. This also handles the case src_device == - // dst_device. - bool p2pEnabled = THCState_getPeerToPeerAccess( - globalContext().getTHCState(), src_device.index(), dst_device.index()); - // We always perform the copy on the source device, using the current stream // on the source device, and we fully synchronize on both src and dst's // current streams for completion of the copy. We have to explicitly do this @@ -94,61 +59,18 @@ void copy_device_to_device(Tensor& dst, const Tensor& src) { if (memcpy_eligible) { // Perform the copy AT_CUDA_CHECK(cudaMemcpyAsync( - dst.data(), - src.data(), - numel * sizeof(dst_T), + iter.data_ptr(0), + iter.data_ptr(1), + numel * iter.element_size(0), cudaMemcpyDeviceToDevice, copy_stream)); } else { - // Non-contiguous copy or a type-conversion copy - - // We avoid creating temporary memory copies if possible. 
- // If both src and dst are on the same device, or if they are on - // different devices and p2p access is enabled, perform the copy - // by a pointwise copy kernel. - // Otherwise, we'll have to make contiguous (which will in fact - // invoke copy() again), and then perform the copy. - // FIXME: might want to consider only running the pointwise kernel - // if both src and dst innermost dimensions are contiguous. If - // they are not, then taking the hit of the memory allocation/free - // might be worth it to avoid non-coalesced reads or writes. - if (p2pEnabled) { - CopyOp::apply(dst, src); - } else { - // GPUs can't access each other directly, but the tensors - // involved are non-contiguous and/or are different types. - - // Make sure the src is contiguous and in the same type as dst - Tensor src_contig; - if (same_type) { - src_contig = src.contiguous(); - } else { - // Types are different - // Copy into the new format, contiguous, on the source device - src_contig = at::empty_like(dst, src.options().dtype(dst.dtype())); - - CopyOp::apply(src_contig, src); - } - - // Make sure the dst is contiguous - device_guard.set_device(dst_device); - Tensor dst_contig = dst.contiguous(); - - // Now, we are ready for a cross-device memcpy of contiguous - // data, of the same layout and type - device_guard.set_device(src_device); - - AT_CUDA_CHECK(cudaMemcpyAsync( - dst_contig.data(), - src_contig.data(), - numel * sizeof(dst_T), - cudaMemcpyDeviceToDevice, - copy_stream)); - - if (!dst.is_contiguous()) { - copy_device_to_device(dst, dst_contig); - } - } + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] { + using dst_t = scalar_t; + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(1), "copy_", [&] { + copy_kernel_impl(iter); + }); + }); } if (src_device != dst_device) { @@ -166,143 +88,103 @@ void copy_device_to_device(Tensor& dst, const Tensor& src) { AT_CUDA_CHECK(cudaGetLastError()); } -void copy_from_cpu(Tensor& dst, const Tensor& src) { - Tensor dst_contig = dst.contiguous(); - Tensor src_contig = src.contiguous(); +static bool copy_requires_temporaries(TensorIterator& iter) { + Device dst_device = iter.device(0); + Device src_device = iter.device(1); - CUDAStream stream = getCurrentCUDAStream(); + if (dst_device == src_device) { + // We never require temporaries for copies on the same GPU. + TORCH_INTERNAL_ASSERT(dst_device.is_cuda() && src_device.is_cuda()); + return false; + } - AT_CUDA_CHECK(cudaMemcpyAsync( - dst_contig.data_ptr(), - src_contig.data_ptr(), - src.numel() * src.element_size(), - cudaMemcpyHostToDevice, - stream)); - AT_CUDA_CHECK(cudaStreamSynchronize(stream)); - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "copy_from_cpu", [&]() { - copy_device_to_device(dst, dst_contig); - }); + bool same_dtype = iter.dtype(0) == iter.dtype(1); + if (same_dtype && iter.is_contiguous()) { + // Contiguous same-dtype copies can always use cudaMemcpyAsync + return false; + } else if (dst_device.is_cuda() && src_device.is_cuda()) { + // Copies between GPUs can use the copy kernel if P2P is supported + return !THCState_getPeerToPeerAccess( + globalContext().getTHCState(), src_device.index(), dst_device.index()); + } else { + // The remaining cases require temporaries. For example, this includes + // non-contiguous copies between CPU and GPU. 
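A concrete case that lands in this "requires temporaries" branch (hypothetical usage; the shapes and tensor names are made up for illustration): a transposed CPU float tensor copied into a CUDA half tensor is neither same-dtype nor contiguous, so copy_kernel_cuda below first materializes a contiguous, dtype-converted temporary and then issues a single cudaMemcpyAsync.

  #include <torch/torch.h>  // assumes the public libtorch API is available

  int main() {
    auto src = torch::randn({128, 64}).t();  // non-contiguous CPU float
    auto dst = torch::empty({64, 128},
                            torch::dtype(torch::kHalf).device(torch::kCUDA));
    dst.copy_(src);  // different dtype + non-contiguous source: temporaries path
    return 0;
  }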
+ return true; + } } -void copy_to_cpu(Tensor& dst, const Tensor& src) { - Tensor dst_contig = dst.contiguous(); - Tensor src_contig = src.contiguous(); - - CUDAGuard device_guard(src.device()); - CUDAStream stream = getCurrentCUDAStream(); - - AT_CUDA_CHECK(cudaMemcpyAsync( - dst_contig.data_ptr(), - src_contig.data_ptr(), - src.numel() * src.element_size(), - cudaMemcpyDeviceToHost, - stream)); - AT_CUDA_CHECK(cudaStreamSynchronize(stream)); - _copy_same_type_(dst, dst_contig); -} +static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { + AT_ASSERT(iter.ntensors() == 2); + + if (copy_requires_temporaries(iter)) { + // NB: this involves recursive calls to copy. Be careful that those copies + // don't require temporaries or you will cause an infinite recursion! + auto& dst = iter.tensor(0); + Tensor dst_contig; + Tensor src_contig; + + // Type conversions are performed on the CPU for CPU-GPU copies and on + // the src device for GPU-GPU copies. + if (iter.device_type(0) == kCUDA) { + dst_contig = dst.is_contiguous() ? dst : at::empty_like(dst); + src_contig = iter.tensor(1).to(iter.dtype(0)).expand_as(dst).contiguous(); + } else { + bool same_type = iter.dtype(0) == iter.dtype(1); + dst_contig = (dst.is_contiguous() && same_type) ? dst : at::empty_like(dst, iter.dtype(1)); + src_contig = iter.tensor(1).expand_as(dst).contiguous(); + } -void copy_from_cpu_async_(Tensor& dst, const Tensor& src) { - AT_CHECK(dst.is_contiguous(), "Target tensor must be contiguous."); - AT_CHECK(src.is_contiguous(), "Source tensor must be contiguous."); + // perform a same-dtype copy on contiguous tensors + TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); + TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); + dst_contig.copy_(src_contig, non_blocking); - if (dst.numel() == 0) { + // if necessary, copy back into dst + if (!dst_contig.is_same(dst)) { + TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); + dst.copy_(dst_contig, non_blocking); + } return; } - CUDAGuard device_guard(dst.device()); - CUDAStream stream = getCurrentCUDAStream(); - - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "copy_from_cpu_async", [&]() { - AT_CUDA_CHECK(cudaMemcpyAsync( - dst.data(), - src.data(), - src.numel() * sizeof(scalar_t), - cudaMemcpyHostToDevice, - stream)); - AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent( - src.storage().data(), stream)); - }); -} + Device dst_device = iter.device(0); + Device src_device = iter.device(1); -void copy_to_cpu_async_(Tensor& dst, const Tensor& src) { - AT_CHECK(dst.is_contiguous(), "Target tensor must be contiguous."); - AT_CHECK(src.is_contiguous(), "Source tensor must be contiguous."); - - if (dst.numel() == 0) { + // Copy on GPU (or between GPUs) + if (dst_device.is_cuda() && src_device.is_cuda()) { + copy_device_to_device(iter, non_blocking); return; } - CUDAGuard device_guard(src.device()); - CUDAStream stream = getCurrentCUDAStream(); - - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "copy_to_cpu_async", [&]() { - AT_CUDA_CHECK(cudaMemcpyAsync( - dst.data(), - src.data(), - src.numel() * sizeof(scalar_t), - cudaMemcpyDeviceToHost, - stream)); - AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent( - src.storage().data(), stream)); - }); -} - -template -void _copy__cuda(Tensor& dst, const Tensor& src, bool non_blocking) { - AT_CHECK(dst.numel() == src.numel(), "sizes do not match"); - 
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, src.scalar_type(), "_copy__cuda", [&]() { - if (dst.is_cuda() && src.is_cuda()) { - copy_device_to_device(dst, src); - } else if (dst.is_cuda()) { - if (std::is_same::value) { - if (non_blocking) { - copy_from_cpu_async_(dst, src); - } else { - copy_from_cpu(dst, src); - } - } else { - // Do a dtype converting copy on the CPU, then copy to device - Tensor srcf = at::empty_like(src, src.options().dtype(dst.dtype())); - s_copy_(srcf, src); - copy_from_cpu(dst, srcf); - } - } else { - if (std::is_same::value) { - if (non_blocking) { - copy_to_cpu_async_(dst, src); - } else { - copy_to_cpu(dst, src); - } - } else { - // Copy to CPU as the same dtype, then do a dtype converting copy - Tensor srcf = at::empty_like(src, dst.options().dtype(src.dtype())); - copy_to_cpu(srcf, src); - s_copy_(dst, srcf); - } - } - }); -} + // Copy between CPU and GPU + cuda::OptionalCUDAGuard device_guard; + cudaMemcpyKind kind; + if (dst_device.is_cuda() && src_device.is_cpu()) { + device_guard.set_device(dst_device); + kind = cudaMemcpyHostToDevice; + } else if (dst_device.is_cpu() && src_device.is_cuda()) { + device_guard.set_device(src_device); + kind = cudaMemcpyDeviceToHost; + } else { + TORCH_INTERNAL_ASSERT(false, "unsupported devices in GPU copy_()"); + } -} // namespace + void* dst = iter.data_ptr(0); + void* src = iter.data_ptr(1); + int64_t nbytes = iter.numel() * iter.element_size(0); + CUDAStream stream = getCurrentCUDAStream(); -namespace at { -namespace native { + AT_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream)); -Tensor& _s_copy__cuda(Tensor& self, const Tensor& src, bool non_blocking) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "_copy__cuda", [&]() { - ::_copy__cuda(self, src, non_blocking); - }); - return self; + if (non_blocking) { + void* ptr = (dst_device == kCPU ? dst : src); + AT_CUDA_CHECK(THCCachingHostAllocator_recordEvent(ptr, stream)); + } else { + AT_CUDA_CHECK(cudaStreamSynchronize(stream)); + } } -Tensor _s_copy_from_cuda( - const Tensor& self, - const Tensor& dst, - bool non_blocking) { - Tensor dst_ = dst; - _s_copy__cuda(dst_, self); - return dst; -} +REGISTER_DISPATCH(copy_stub, ©_kernel_cuda); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index ea894edc5953..d239551f5475 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -109,13 +109,13 @@ class CuFFTConfig { if (input.scalar_type() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 auto dev_prop = at::cuda::getCurrentDeviceProperties(); - AT_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), "cuFFT doesn't support signals of half type with compute " "capability less than SM_53, but the device containing input half " "tensor only has SM_", dev_prop->major, dev_prop->minor); for (int64_t i = 0; i < signal_ndim; i++) { auto signal_size = checked_signal_sizes[i]; - AT_CHECK(is_pow_of_two(signal_size), + TORCH_CHECK(is_pow_of_two(signal_size), "cuFFT doesn't support signals of half type with size at any ", "dimension that is not a power of two, but got a signal size of ", checked_signal_sizes); @@ -451,9 +451,9 @@ class CuFFTParamsLRUCache { // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. 
Since // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check // first. - AT_CHECK(new_size >= 0, + TORCH_CHECK(new_size >= 0, "cuFFT plan cache size must be non-negative, but got ", new_size); - AT_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); _max_size = static_cast(new_size); } diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu new file mode 100644 index 000000000000..8ef0ed8dbf04 --- /dev/null +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -0,0 +1,420 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +namespace { + +__device__ inline int min(int a, int b) { + return a <= b ? a : b; +} + +// kernels borrowed from Caffe +template +__global__ void MaxPoolForward(const int nthreads, const scalar_t* bottom_data, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, scalar_t* top_data, + int64_t* top_mask) { + CUDA_KERNEL_LOOP(index, nthreads) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + while(hstart < 0) + hstart += dilation_h; + while(wstart < 0) + wstart += dilation_w; + accscalar_t maxval = THCNumerics::min(); + int maxidx = -1; + bottom_data += (n * channels + c) * height * width; + for (int h = hstart; h < hend; h += dilation_h) { + for (int w = wstart; w < wend; w += dilation_w) { + scalar_t val = bottom_data[h * width + w]; + if ((ScalarConvert::to(val) > maxval) || THCNumerics::isnan(val)) { + maxidx = h * width + w; + maxval = ScalarConvert::to(val); + } + } + } + top_data[index] = ScalarConvert::to(maxval); + top_mask[index] = maxidx; + } +} + +static const int BACKWARD_THREADS = 256; + +template +#if defined (__HIP_PLATFORM_HCC__) +C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 4) +#else +C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 8) +#endif +__global__ void MaxPoolBackward(const int nthreads, const scalar_t* top_diff, + const int64_t* top_mask, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + scalar_t* bottom_diff) { + CUDA_KERNEL_LOOP(index, height*width) { + int h = index/width; + int w = index - h * width; +//get some templating performance benefits without actually templating + int phstart, phend, pwstart, pwend; + if (stride_h == 1) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1; + phend = min((h + pad_h) + 1, pooled_height); + } else if (stride_h == 2) { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 
0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1; + phend = min((h + pad_h) / 2 + 1, pooled_height); + } else { + phstart = + (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; + phend = min((h + pad_h) / stride_h + 1, pooled_height); + } + if (stride_w == 1) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1; + pwend = min((w + pad_w) + 1, pooled_width); + } else if (stride_w == 2) { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1; + pwend = min((w + pad_w) / 2 + 1, pooled_width); + } else { + pwstart = + (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; + pwend = min((w + pad_w) / stride_w + 1, pooled_width); + } + for (int n = blockIdx.y; n < num; n += gridDim.y) + for (int c = blockIdx.z; c < channels; c+= gridDim.z) { + + accscalar_t gradient = accscalar_t(0); + int offset = (n * channels + c) * pooled_height * pooled_width; + top_diff += offset; + top_mask += offset; +//get some templating performance benefits without actually templating + if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (top_mask[ph * pooled_width + pw] == h * width + w) { + gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); + } + } + } + } else { + if (top_mask[phstart * pooled_width + pwstart] == h * width + w) { + gradient += ScalarConvert::to(top_diff[phstart * pooled_width + pwstart]); + } + } + bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert::to(gradient); + } + } +} + +void max_pool2d_with_indices_out_cuda_template( + Tensor& output, + Tensor& indices, + const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + TensorArg output_arg{ output, "output", 1 }; + TensorArg indices_arg{ indices, "indices", 2 }; + TensorArg input_arg{ input_, "input_", 3 }; + + checkAllSameGPU("max_pool2d_with_indices_out_cuda", + {output_arg, indices_arg, input_arg}); + + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const int64_t nbatch = input_.ndimension() == 4 ? 
input_.size(-4) : 1; + const int64_t nInputPlane = input_.size(-3); + const int64_t inputHeight = input_.size(-2); + const int64_t inputWidth = input_.size(-1); + + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + + max_pool2d_with_indices_shape_check( + input_, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth); + + Tensor input = input_.contiguous(); + + output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + indices.resize_({nbatch, nInputPlane, outputHeight, outputWidth}); + + const int count = safe_downcast(output.numel()); + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BACKWARD_THREADS); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), + "max_pool2d_with_indices_out_cuda_frame", + [&] { + using accscalar_t = acc_type; + + scalar_t *output_data = output.data(); + scalar_t *input_data = input.data(); + int64_t *indices_data = indices.data(); + + MaxPoolForward + <<>>( + count, input_data, + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); } + ); + + TORCH_CHECK(cudaGetLastError() == cudaSuccess, + "max_pool2d_with_indices_out_cuda_frame failed with error code ", + cudaGetLastError()); + + if(input.ndimension() == 3) { + output.resize_({nInputPlane, outputHeight, outputWidth}); + } +} + +void max_pool2d_with_indices_backward_out_cuda_template( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input_, + const Tensor& indices, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + TensorArg gradInput_arg{ gradInput, "gradInput", 1 }; + TensorArg gradOutput_arg{ gradOutput_, "gradOutput_", 2 }; + TensorArg input_arg{ input_, "input_", 3 }; + TensorArg indices_arg{ indices, "indices", 4 }; + + checkAllSameGPU("max_pool2d_with_indices_out_cuda", + {gradInput_arg, gradOutput_arg, input_arg, indices_arg}); + + // XXX JIT: Pooling.cpp allows stride.empty(). + // XXX IntegrationTest.MNIST: padding.size() == 1 && dilation.size() == 1. + TORCH_CHECK(kernel_size.size() == 2 && + (stride.empty() || stride.size() == 2) && + (padding.size() == 1 || padding.size() == 2) && + (dilation.size() == 1 || dilation.size() == 2), + "max_pool2d_with_indices: internal error: all IntArrayRef sizes must be 2"); + + TORCH_CHECK((input_.ndimension() == 3 || input_.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + const int kH = safe_downcast(kernel_size[0]); + const int kW = safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const Tensor input = input_.contiguous(); + + const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + max_pool2d_with_indices_shape_check( + input_, + gradOutput_, + indices, + nbatch, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, + /*cuda=*/ true); + + const Tensor gradOutput = gradOutput_.contiguous(); + gradInput.resize_as_(input); + + int64_t count = input.numel(); + dim3 grid; + int imgcount = inputWidth * inputHeight; + const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS; + grid.x = blocks; + grid.y = nbatch; + grid.z = nInputPlane; + uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + if (maxGridY < grid.y) grid.y = maxGridY; + if (maxGridZ < grid.z) grid.z = maxGridZ; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), + "max_pool2d_with_indices_out_cuda_frame", + [&] { + using accscalar_t = acc_type; + + scalar_t *gradOutput_data = gradOutput.data(); + scalar_t *gradInput_data = gradInput.data(); + int64_t *indices_data = indices.data(); + + MaxPoolBackward + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + } + ); + + TORCH_CHECK(cudaGetLastError() == cudaSuccess, + "fractional_max_pool2d_backward_out_cuda failed with error code ", + cudaGetLastError()); +} + +} // namespace + +std::tuple max_pool2d_with_indices_out_cuda( + Tensor& output, + Tensor& indices, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + max_pool2d_with_indices_out_cuda_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +std::tuple max_pool2d_with_indices_cuda( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) +{ + Tensor output = at::empty({0}, input.options()); + Tensor indices = at::empty({0}, input.options().dtype(kLong)); + max_pool2d_with_indices_out_cuda_template( + output, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return std::tuple(output, indices); +} + +Tensor& max_pool2d_with_indices_backward_out_cuda( + Tensor& gradInput, + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + max_pool2d_with_indices_backward_out_cuda_template( + gradInput, + gradOutput_, + input, + indices, + kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +Tensor max_pool2d_with_indices_backward_cuda( + const Tensor& gradOutput_, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& indices) +{ + auto gradInput = at::zeros_like(input); + max_pool2d_with_indices_backward_out_cuda_template( + gradInput, + gradOutput_, + input, + indices, + 
kernel_size, + stride, + padding, + dilation, + ceil_mode); + return gradInput; +} + +} // at::native +} // at diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 079a26f98022..0aec588b5476 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -11,6 +11,8 @@ #include #include +#include +#include #include #include @@ -23,17 +25,160 @@ #include #include +/** + * Note [Register spilling in curand call for CUDA < 10] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * For CUDA < 10, curandStatePhilox4_32_10_t engine achieves poor performance (60% SOL bandwidth) + * when called to generate one random number at a time. This is because the line + * unsigned ret = (&state->output.x)[state->STATE++]; + * in + * QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state) + * in curand_kernel.h dynamically indexes into state.output, preventing the compiler from ever + * storing state.output in registers. + * + * CUDA 10 fixed this problem. However, for backwards compatibility, in the following kernels + * we are using curand distributions that utilize curand4 call. curand4 call doesn't have the + * register spilling problem. + */ + THCGenerator* THCRandom_getGenerator(THCState* state); namespace { -// increment should be at least the number of curand() random numbers used in -// each thread. +// Increment should be at least the number of curand() random numbers used in +// each thread. It is the user's responsibility to make sure that the increment for philox is never +// smaller than the number of curand() calls. Increment value > the number of curand() calls +// won't harm but anything less would mean that you would be reusing random values from +// previous calls. +// e.g. In many kernels below, we use distributions that utilize curand4 call in the kernel. +// Hence, increment value should be at least 4 for those kernels. std::pair next_philox_seed(at::Generator* gen, uint64_t increment) { auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState()); uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment); return std::make_pair(gen_->state.initial_seed, offset); } +// launch bounds used for kernels utilizing TensorIterator +const uint32_t block_size_bound = 256; +const uint32_t grid_size_bound = 4; +// number of randoms given by distributions like curand_uniform4, curand_uniform2_double +// used in calculating philox offset. +const uint32_t curand4_engine_calls = 4; + +// utility function that calculates proper philox_offset +// for distributions utilizing TensorIterator. For distributions using +// TensorIterator, we are using a grid-stride loop with each +// thread yielding one element per thread. For the edge of the grid-stride +// loop, if the tensor size is large, the unroll loop will kick in and the float4 +// from curand4 will start getting utilized (for common tensor sizes, we end up +// using rand.x from each thread). Hence, the philox_offset is +// (number of elements per thread * number of engine calls), which makes +// sure that philox offset increment is not less than the number of randoms used +// in each thread. 
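A worked example of the offset arithmetic implemented just below, using assumed device numbers (20 SMs, maxThreadsPerMultiProcessor = 2048) and numel = 1 << 20; none of these figures come from the patch itself:

  blocks_per_sm                 = 2048 / 256 = 8
  grid.x                        = min(20 * 8, (1048576 + 255) / 256) = 160
  elements per grid-stride pass = 256 * 160 * 4 (unroll) = 163840
  passes per thread             = (1048576 - 1) / 163840 + 1 = 7
  counter_offset                = 7 * curand4_engine_calls = 28

Each pass makes one curand4 call that consumes 4 philox counter values, so reserving 28 values per thread guarantees that the next kernel drawing from the same generator never reuses randoms.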
+std::tuple calc_execution_policy(int64_t total_elements) { + const uint64_t numel = static_cast(total_elements); + const uint32_t block_size = block_size_bound; + const uint32_t unroll = curand4_engine_calls; + dim3 dim_block(block_size); + dim3 grid((numel + block_size - 1) / block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + grid.x = std::min( + static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm, + grid.x); + //number of times random will be generated per thread, to offset philox counter in thc random state + uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll) + 1) + * curand4_engine_calls; + return std::make_tuple(counter_offset, grid, dim_block); +} + +// grid stride loop kernel for distributions +template +C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) +__global__ void distribution_elementwise_grid_stride_kernel(int numel, + std::pair seeds, + const dist_t dist_func, + const transform_t transform_func) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init( + seeds.first, + idx, + seeds.second, + &state); + int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * + blockDim.x * gridDim.x * unroll_factor; + for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { + auto rand = dist_func(&state); + #pragma unroll + for (int ii = 0; ii < unroll_factor; ii++) { + int li = linear_index + blockDim.x * gridDim.x * ii; + if (li < numel) { + transform_func(li, static_cast((&rand.x)[ii])); + } + } + __syncthreads(); + } +} + +template +void distribution_nullary_kernel(at::TensorIterator& iter, + at::Generator* gen, + const dist_t& dist_func, + const transform_t transform_func) { + static_assert(unroll_factor >= 1, "unroll_factor must be >= 1."); + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + auto execution_policy = calc_execution_policy(numel); + auto counter_offset = std::get<0>(execution_policy); + auto grid = std::get<1>(execution_policy); + auto block = std::get<2>(execution_policy); + auto seeds = next_philox_seed(gen, counter_offset); + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_nullary_kernel(sub_iter, + gen, dist_func, transform_func); + } + return; + } + + char* out_data = (char*)iter.data_ptr(0); + + auto stream = at::cuda::getCurrentCUDAStream(); + if (iter.is_trivial_1d()) { + auto strides = iter.get_inner_strides(); + int stride0 = strides[0]; + distribution_elementwise_grid_stride_kernel<<>>( + numel, + seeds, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + scalar_t* out = (scalar_t*)&out_data[stride0 * idx]; + *out = transform_func(rand); + } + ); + } else { + auto offset_calc = at::native::make_offset_calculator<1>(iter); + distribution_elementwise_grid_stride_kernel<<>>( + numel, + seeds, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + auto offsets = offset_calc.get(idx); + scalar_t* out = (scalar_t*)&out_data[offsets[0]]; + *out = transform_func(rand); + } + ); + } + AT_CUDA_CHECK(cudaGetLastError()); +} + template void poisson_cuda_kernel( at::Tensor& ret, @@ -117,6 +262,7 @@ void bernoulli_tensor_cuda_kernel( blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); + // See Note [Register spilling in curand call for CUDA < 10] float4 rand = curand_uniform4(&state); switch (n) { case 4: { @@ -159,6 
+305,7 @@ void bernoulli_scalar_cuda_kernel( blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); + // See Note [Register spilling in curand call for CUDA < 10] float4 rand = curand_uniform4(&state); switch (n) { case 4: { @@ -248,7 +395,7 @@ Tensor& bernoulli_tensor_cuda_(Tensor &self, const Tensor& p_, Generator* gen) { } Tensor& bernoulli_scalar_cuda_(Tensor &self, double p, Generator* gen) { - AT_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, self.scalar_type(), "bernoulli_scalar_cuda_", [&] { auto seeds = next_philox_seed(gen, 10); bernoulli_scalar_cuda_kernel(self, p, seeds); @@ -256,5 +403,49 @@ Tensor& bernoulli_scalar_cuda_(Tensor &self, double p, Generator* gen) { return self; } +void uniform_kernel_cuda(TensorIterator& iter, double from_, double to_, Generator* gen) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "uniform_cuda", [&] { + auto from = static_cast(from_); + auto to = static_cast(to_); + TORCH_CHECK(from <= to, + "uniform_ expects to return a [from, to) range, but found from=", from, + " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(iter.dtype()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + + using accscalar_t = at::acc_type; + auto range = static_cast(to-from); + from = static_cast(from); + // define lambda to reverse bounds, multiply 'range' and add 'from_' + auto uniform_func = [range, from] __device__ (accscalar_t rand) { + // reverse the bounds of curand4 from (0, 1] to [0, 1) + // Note that this method is from legacy THCTensorRandom and is likely to give + // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and + // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. + auto reverse_bound_rand = rand == static_cast(1.0) ? 
static_cast(0.0) : rand; + return static_cast(reverse_bound_rand * range + from); + }; + if (std::is_same::value) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform2_double(state); }, + uniform_func); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform4(state); }, + uniform_func); + } + }); +} + +Tensor& uniform_cuda_(Tensor& self, double from, double to, Generator* gen) { + auto iter = TensorIterator::nullary_op(self); + uniform_kernel_cuda(*iter, from, to, gen); + return self; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index d858a0ece80b..e617af6611e5 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -102,6 +102,8 @@ fused_dropout_cuda(const Tensor& self, double p, Generator * gen){ Tensor ret = at::empty_like(self); Tensor mask = at::empty(self.sizes(), self.options().dtype(kByte)); const int64_t nelem = self.numel(); +//empty tensors should not get here, but just in case, avoid FPE + if (nelem==0) return std::tuple(self, mask); const int64_t block_size = 256; unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size; dim3 dim_block(block_size); @@ -152,7 +154,7 @@ fused_dropout_cuda(const Tensor& self, double p, Generator * gen){ Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){ Tensor ret = at::empty_like(self); - AT_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype"); + TORCH_CHECK(mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype"); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "masked_scale", [&] { using accscalar_t = acc_type; accscalar_t pa = (accscalar_t)(scale); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index ca9132ad90e7..f0761bd9b213 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -472,7 +472,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( const Tensor& offsets, const Tensor& offset2bag, int64_t mode) { - AT_CHECK( + TORCH_CHECK( mode == MODE_SUM, "embedding_bag_backward: per_sample_weights only supported for mode='sum'"); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index 7a0951419c7f..219eaa656667 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -135,11 +135,11 @@ void fractional_max_pool2d_out_cuda_template( int numBatch = 1; int ndims = input.ndimension(); - AT_CHECK(input.numel() > 0, + TORCH_CHECK(input.numel() > 0, "fractional_max_pool2d(): expected input to have non-empty ", "spatial dimensions."); - AT_CHECK((ndims == 3 || ndims == 4), + TORCH_CHECK((ndims == 3 || ndims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); if (ndims == 4) { @@ -159,10 +159,10 @@ void fractional_max_pool2d_out_cuda_template( int poolSizeH = pool_size[0]; int poolSizeW = pool_size[1]; - AT_CHECK(outputH + poolSizeH - 1 <= inputH, + TORCH_CHECK(outputH + poolSizeH - 1 <= inputH, "fractional_max_pool2d(): pool_size height ", poolSizeH, " too large relative to input height ", inputH); - AT_CHECK(outputW + poolSizeW - 1 <= inputW, + TORCH_CHECK(outputW + poolSizeW - 1 <= inputW, "pool_size width ", 
poolSizeW, " too large relative to input width ", inputW); @@ -208,7 +208,7 @@ void fractional_max_pool2d_out_cuda_template( poolSizeH, poolSizeW); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_out_cuda_frame failed with error code ", cudaGetLastError()); } @@ -237,9 +237,9 @@ void fractional_max_pool2d_backward_out_cuda_template( int outputH = output_size[0]; int outputW = output_size[1]; - AT_CHECK(outputH == gradOutput.size(dimh), + TORCH_CHECK(outputH == gradOutput.size(dimh), "fractional_max_pool2d(): gradOutput height unexpected"); - AT_CHECK(outputW == gradOutput.size(dimw), + TORCH_CHECK(outputW == gradOutput.size(dimw), "fractional_max_pool2d(): gradOutput width unexpected"); /* resize */ @@ -277,7 +277,7 @@ void fractional_max_pool2d_backward_out_cuda_template( devGradInput, devGradOutput, devIndices); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_backward_out_cuda_frame failed with error code ", cudaGetLastError()); } diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index 95f9b1a73bfc..c44b49c004d4 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -163,7 +163,7 @@ void fractional_max_pool3d_out_cuda_template( int64_t poolSizeW = pool_size[2]; int64_t ndims = input.ndimension(); - AT_CHECK( + TORCH_CHECK( input.numel() != 0 && (ndims == 4 || ndims == 5), "fractional_max_pool3d_out_cuda_template(): ", "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", @@ -183,17 +183,17 @@ void fractional_max_pool3d_out_cuda_template( int64_t inputH = input.size(dimh); int64_t inputW = input.size(dimw); - AT_CHECK( + TORCH_CHECK( outputT + poolSizeT - 1 < inputT, "fractional_max_pool3d_out_cuda_template(): ", "pool time (", poolSizeT, ") too large relative to input time (", inputT, ")"); - AT_CHECK( + TORCH_CHECK( outputH + poolSizeH - 1 < inputH, "fractional_max_pool3d_out_cuda_template(): ", "pool height (", poolSizeH, ") too large relative to input height (", inputH, ")"); - AT_CHECK( + TORCH_CHECK( outputW + poolSizeW - 1 < inputW, "fractional_max_pool3d_out_cuda_template(): ", "pool width (", poolSizeW, ") too large relative to input width (", @@ -244,7 +244,7 @@ void fractional_max_pool3d_out_cuda_template( ); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_out_cuda_template failed with error code ", cudaGetLastError()); } @@ -276,17 +276,17 @@ void fractional_max_pool3d_backward_out_cuda_template( int64_t inputH = input.size(dimh); int64_t inputW = input.size(dimw); - AT_CHECK( + TORCH_CHECK( outputT == gradOutput.size(dimt), "fractional_max_pool3d_backward_out_cuda_template(): ", "gradOutput time unexpected" ); - AT_CHECK( + TORCH_CHECK( outputH == gradOutput.size(dimh), "fractional_max_pool3d_backward_out_cuda_template(): ", "gradOutput height unexpected" ); - AT_CHECK( + TORCH_CHECK( outputW == gradOutput.size(dimw), "fractional_max_pool3d_backward_out_cuda_template(): ", "gradOutput width unexpected" @@ -332,7 +332,7 @@ void fractional_max_pool3d_backward_out_cuda_template( ); } ); - AT_CHECK(cudaGetLastError() == cudaSuccess, + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "fractional_max_pool2d_out_cuda_template failed with error code ", cudaGetLastError()); } diff --git a/aten/src/ATen/native/cuda/Lerp.cu 
b/aten/src/ATen/native/cuda/Lerp.cu index d7a660e21c9a..2861485c8563 100644 --- a/aten/src/ATen/native/cuda/Lerp.cu +++ b/aten/src/ATen/native/cuda/Lerp.cu @@ -38,7 +38,7 @@ namespace native { Tensor& lerp_cuda_tensor_out(Tensor& result, const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_out_cuda"); result.resize_as_(b_self); @@ -62,10 +62,10 @@ Tensor& lerp_cuda_scalar_out(Tensor& result, const Tensor& self, Tensor& lerp_cuda_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp__cuda"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cuda", [&]{ lerp_cuda(self, b_self, b_end, b_weight); @@ -76,7 +76,7 @@ Tensor& lerp_cuda_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) Tensor& lerp_cuda_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor b_self, b_end; std::tie(b_self, b_end) = expand_outplace(self, end, "lerp__cuda"); - AT_CHECK(b_self.sizes() == self.sizes(), + TORCH_CHECK(b_self.sizes() == self.sizes(), "output with shape ", self.sizes(), " doesn't match the broadcast shape ", b_self.sizes()); AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lerp__cuda", [&]{ @@ -87,7 +87,7 @@ Tensor& lerp_cuda_scalar_(Tensor& self, const Tensor& end, Scalar weight) { Tensor lerp_cuda_tensor(const Tensor& self, const Tensor& end, const Tensor& weight) { Tensor b_self, b_end, b_weight; - AT_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), + TORCH_CHECK(weight.dim() <= std::max(self.dim(), end.dim()), "weight should be of dimension max(self.dim(), end.dim()) or lesser"); std::tie(b_self, b_end, b_weight) = expand_outplace(self, end, weight, "lerp_cuda"); Tensor result = at::empty_like(b_self); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 8521a344a77b..141771e3c363 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -179,9 +179,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); - AT_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); - AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + TORCH_CHECK((0 <= BLANK) && (BLANK < num_labels), "blank must be in label range"); + TORCH_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + TORCH_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); int64_t lp_input_stride = log_probs.stride(0); int64_t lp_char_stride = log_probs.stride(2); @@ -211,13 +211,13 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, 
const } tg_target_stride = targets.stride(1); checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, + TORCH_CHECK(targets.size(1) >= max_target_length, "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, " (while checking arguments for ", c, ")"); } int64_t max_input_length = log_probs.size(0); for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, + TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, " (while checking arguments for ", c, ")"); } diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 8f92acd4393d..35e4684ee480 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -86,7 +86,7 @@ std::tuple batch_norm_update_stats_cuda( return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "batch_norm_backward", [&] { auto mean_st = running_mean.dtype(); auto var_st = running_var.dtype(); - AT_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); + TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); // Some workloads depend on passing in half input and float stats, which is // usually handled by cuDNN. However, the JIT sometimes replaces cuDNN calls with this // one so it needs to support the same case, or people start to complain. diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 3286e1bd0629..d3ad7792a07f 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -38,7 +38,7 @@ struct LogspaceOp { }; Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t steps) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -68,7 +68,7 @@ Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t step } Tensor& logspace_cuda_out(Tensor& result, Scalar start, Scalar end, int64_t steps, double base) { - AT_CHECK(steps >= 0, "number of steps must be non-negative"); + TORCH_CHECK(steps >= 0, "number of steps must be non-negative"); if (result.numel() != steps) { result.resize_({steps}); @@ -105,11 +105,11 @@ Tensor& range_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { auto xend = end.to(); auto xstep = step.to(); - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); int64_t size = static_cast(((xend - xstart) / xstep) + 1); if (result.numel() != size) { @@ -152,14 +152,14 @@ Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { / step.to()); } - AT_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - 
AT_CHECK(std::isfinite(static_cast(xstart)) && + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && std::isfinite(static_cast(xend)), "unsupported range: ", xstart, " -> ", xend); - AT_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), "upper bound and larger bound inconsistent with step sign"); - AT_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), "invalid size, possible overflow?"); int64_t size = static_cast(size_d); diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 574bf86eb361..d18758497131 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -16,6 +16,7 @@ #include #include #include +#include namespace at { namespace native { @@ -48,7 +49,7 @@ C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominat a = b; b = tmp; } - + // a is now the GCD numerator /= a; denominator /= a; @@ -200,9 +201,11 @@ template static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { int num_reduce_dims = iter.num_reduce_dims(); int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; std::array strides = { - iter.strides(0).data() + num_reduce_dims, - iter.strides(1).data() + num_reduce_dims, + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, }; auto shape = iter.shape().data() + num_reduce_dims; return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); @@ -211,8 +214,9 @@ static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& template static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; std::array strides = { - iter.strides(1).data(), + iter.strides(input_index).data(), }; return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); } @@ -277,7 +281,7 @@ struct ReduceOp { InputCalculator input_calc; OutputCalculator output_calc; const void* src; - void* dst; + const char* dst[2]; //it accepts at most two destinations // acc_buf used for accumulation among sub Tensor Iterator when accumulation on // output is not permissible void* acc_buf; @@ -286,19 +290,24 @@ struct ReduceOp { int* semaphores; bool accumulate; bool final_output; + int noutputs; ReduceOp(ops_t ops, ReduceConfig config, InputCalculator input_calc, OutputCalculator output_calc, - const void* src, void* dst, void* acc_buf, void* cta_buf, int* semaphores, arg_t ident) + const void* src, char* dst0, optional dst1, void* acc_buf, void* cta_buf, int* semaphores, arg_t ident, int noutputs) : ops(ops) , config(config) , input_calc(input_calc) , output_calc(output_calc) , src(src) - , dst(dst) , acc_buf(acc_buf) , cta_buf(cta_buf) , semaphores(semaphores) - , ident(ident) { + , ident(ident) + , noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } } C10_DEVICE void run() const { @@ -320,7 +329,7 @@ struct ReduceOp { value = block_x_reduce(value, shared_memory); } - auto out = (out_scalar_t*)((char*)dst + base_offsets[0]); + auto out = (out_scalar_t*)((char*)dst[0] + base_offsets[0]); 
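The ReduceOp change above swaps the single void* destination for a two-entry destination array plus a noutputs count, so one reduction pass can write up to two result tensors. A minimal host-side sketch of that dispatch idea, kept separate from the CUDA code; MinArgmin and this set_results pair are illustrative names only, not the PR's types (the kernel uses thrust::tuple and out_scalar_t):

#include <cassert>
#include <cstdio>
#include <utility>

struct MinArgmin {                       // example reduction with two outputs: min + argmin
  using acc_t = std::pair<float, long>;  // (running min, its index)
  static acc_t combine(acc_t a, acc_t b) { return b.first < a.first ? b : a; }
  static std::pair<float, long> project(acc_t a) { return a; }  // pair -> two outputs
};

// Mirrors the two set_results overloads: scalar result vs. tuple/pair result.
void set_results(float x, float* dst0, long* /*dst1*/, int noutputs) {
  assert(noutputs == 1);
  *dst0 = x;
}
void set_results(std::pair<float, long> x, float* dst0, long* dst1, int noutputs) {
  if (noutputs >= 1) *dst0 = x.first;
  if (noutputs >= 2) *dst1 = x.second;
}

int main() {
  float data[5] = {3.f, 1.f, 4.f, 1.5f, 9.f};
  MinArgmin::acc_t acc{data[0], 0};
  for (long i = 1; i < 5; ++i) acc = MinArgmin::combine(acc, {data[i], i});
  float out_val; long out_idx;
  set_results(MinArgmin::project(acc), &out_val, &out_idx, /*noutputs=*/2);
  std::printf("min=%f argmin=%ld\n", out_val, out_idx);
}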
arg_t* acc = nullptr; if (acc_buf != nullptr) { size_t numerator = sizeof(arg_t); @@ -330,19 +339,23 @@ struct ReduceOp { } if (config.should_global_reduce()) { - value = global_reduce(value, out, acc, shared_memory); + value = global_reduce(value, acc, shared_memory); } else if (config.should_store(output_idx)) { if (acc == nullptr) { if (accumulate) { value = accumulate_in_output(out, value); } - *out = project_if_necessary(value); + if (final_output) { + set_results_to_output(value, base_offsets[0]); + } else { + *out = get_accumulated_output(out, value); + } } else { if (accumulate) { value = ops.combine(*acc, value); } if (final_output) { - *out = ops.project(value); + set_results_to_output(value, base_offsets[0]); } else { *acc = value; } @@ -453,14 +466,14 @@ struct ReduceOp { } template - C10_DEVICE out_scalar_t project_if_necessary( - arg_t value, + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, typename std::enable_if::type* = nullptr ) const { - return final_output ? (out_scalar_t)ops.project(value) : (out_scalar_t)value; + assert(!final_output); + return (out_scalar_t)value; } - // This function should never be called -- // it's the version of `accumulate_in_output` // when accumulation in the output is not possible. @@ -473,17 +486,48 @@ struct ReduceOp { return arg_t {}; } + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. template - C10_DEVICE out_scalar_t project_if_necessary( - arg_t value, + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, typename std::enable_if::type* = nullptr ) const { + assert(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::tuple x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (out_scalar_t*)((char*)dst[0] + base_offset); + *res0 = thrust::get<0>(x); + } + if (noutputs >= 2) { + auto res1 = (out_scalar_t *) ((char *) dst[1] + base_offset); + *res1 = thrust::get<1>(x); + } + } + + C10_DEVICE void set_results_to_output(arg_t value, index_t base_offset) const { assert(final_output); - return ops.project(value); + set_results(ops.project(value), base_offset); } - C10_DEVICE arg_t global_reduce(arg_t value, out_scalar_t* out, arg_t* acc, char* shared_memory) const { + C10_DEVICE arg_t global_reduce(arg_t value, arg_t* acc, char* shared_memory) const { arg_t* reduce_buffer = (arg_t*)cta_buf; + index_t output_idx = config.output_idx(); + auto base_offsets = output_calc.get(output_idx); + auto out = (out_scalar_t*)((char*)dst[0] + base_offsets[0]); bool should_store = config.should_store(config.output_idx()); if (should_store) { @@ -523,13 +567,17 @@ struct ReduceOp { if (accumulate) { value = accumulate_in_output(out, value); } - *out = project_if_necessary(value); + if (final_output) { + set_results_to_output(value, base_offsets[0]); + } else { + *out = get_accumulated_output(out, value); + } } else { if (accumulate) { value = ops.combine(*acc, value); } if (final_output) { - *out = ops.project(value); + set_results_to_output(value, base_offsets[0]); } else { *acc = value; } @@ -590,7 +638,7 @@ struct AccumulationBuffer { template inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& 
ops, ident_t ident=0, AccumulationBuffer* acc_buf_ptr=nullptr) { - AT_ASSERT(iter.numel() > 0 && iter.ntensors() == 2); + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); using traits = binary_function_traits; using arg_t = typename traits::arg1_t; @@ -604,7 +652,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id // reused by all recursive function calls. if (acc_buf_ptr == NULL) { // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter - // when accumulation in output is not possible. + // when accumulation in output is not possible. if (!can_accumulate_in_output && !can_use_32bit_indexing) { int64_t output_memory_size = 1; for (int dim = 0; dim < iter.ndim(); dim++) { @@ -627,21 +675,29 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id return; } + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); char* out_data = (char*)iter.data_ptr(0); - const char* in_data = (char*)iter.data_ptr(1); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } char* acc_data = acc_buf_ptr->get_acc_slice(out_data); // Start by assuming that each thread handles a single output and all // the inputs for that output. int64_t num_outputs = iter.num_output_elements(); int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); int64_t dim0; int64_t dim1; // adjust block size to fit width to fast changing dimension - if (iter.strides(/*arg=*/1)[0] == sizeof(scalar_t)) { + if (iter.strides(/*arg=*/input_index)[0] == sizeof(scalar_t)) { dim0 = iter.shape()[0]; dim1 = num_outputs; } else { @@ -654,7 +710,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id int block_width = config.block_width; int block_height = config.block_height; - if (iter.ndim() == 0 || iter.strides(/*arg=*/1)[0] == sizeof(scalar_t)) { + if (iter.ndim() == 0 || iter.strides(/*arg=*/input_index)[0] == sizeof(scalar_t)) { // Split the input across lanes if the input is contiguous in the reduced // dimension. This will require reduction between threads using warp // shuffle instructions and shared memory (if block_width > warpSize). @@ -706,10 +762,12 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id output_calc, in_data, out_data, + out_data_extra, acc_data, buffer.get(), (int*)semaphores.get(), - ident); + ident, + noutputs); reduce.accumulate = iter.should_accumulate(); reduce.final_output = iter.is_final_output(); diff --git a/aten/src/ATen/native/cuda/ReduceOpsKernel.cu b/aten/src/ATen/native/cuda/ReduceOpsKernel.cu index 5c2c22981622..58dfb7cc738a 100644 --- a/aten/src/ATen/native/cuda/ReduceOpsKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceOpsKernel.cu @@ -12,6 +12,7 @@ #include #include #include +#include namespace at { namespace native { @@ -27,14 +28,14 @@ template void std_var_kernel_impl(TensorIterator& iter, bool unbiased, bool take_sqrt) { // reducing unrolling factor to 2 for welford kernel // This is necessary to lower register usage that leads to register spills. 
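WelfordOps/WelfordData in the std_var kernel below are named after Welford's single-pass mean/variance recurrence; the host-side sketch here shows that standard update, assuming the usual formulation (welford_step and finalize are illustrative names, with unbiased/take_sqrt mirroring the kernel's flags):

#include <cmath>
#include <cstdio>

struct WelfordAcc { double mean = 0, m2 = 0; long n = 0; };

void welford_step(WelfordAcc& a, double x) {
  a.n += 1;
  double delta = x - a.mean;
  a.mean += delta / a.n;
  a.m2 += delta * (x - a.mean);   // uses the *updated* mean
}

// unbiased: divide by (n - 1); take_sqrt: return std instead of var
double finalize(const WelfordAcc& a, bool unbiased, bool take_sqrt) {
  double var = a.m2 / (unbiased ? a.n - 1 : a.n);
  return take_sqrt ? std::sqrt(var) : var;
}

int main() {
  WelfordAcc acc;
  double xs[8] = {2, 4, 4, 4, 5, 5, 7, 9};
  for (double x : xs) welford_step(acc, x);
  std::printf("mean=%g var=%g std=%g\n",
              acc.mean, finalize(acc, false, false), finalize(acc, false, true));
}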
- gpu_reduce_kernel(iter, WelfordOps { unbiased, take_sqrt }, WelfordData {}); + gpu_reduce_kernel(iter, WelfordOps> { unbiased, take_sqrt }, WelfordData {}); } template <> void std_var_kernel_impl(TensorIterator& iter, bool unbiased, bool take_sqrt) { // reducing unrolling factor to 2 for welford kernel // This is necessary to lower register usage that leads to register spills. - gpu_reduce_kernel(iter, WelfordOps { unbiased, take_sqrt }, WelfordData {}); + gpu_reduce_kernel(iter, WelfordOps> { unbiased, take_sqrt }, WelfordData {}); } template diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index 0a1cacf416bc..67d9795f5651 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -148,14 +148,14 @@ __global__ void reflection_pad2d_backward_out_kernel( void reflection_pad1d_out_template( Tensor &output, const Tensor &input_, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input_), + TORCH_CHECK(canUse32BitIndexMath(input_), "input tensor must fit into 32-bit index math"); int64_t dim_plane = 0; int64_t dim_w = 1; int64_t nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 2 || input_.ndimension() == 3), "non-empty 2D " "or 3D (batch mode) tensor expected for input, but got: ", input_); @@ -172,11 +172,11 @@ void reflection_pad1d_out_template( int64_t input_w = input_.size(dim_w); int64_t output_w = input_w + pad_l + pad_r; - AT_CHECK(pad_l < input_w && pad_r < input_w, "Padding size should be less " + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Padding size should be less " "than the corresponding input dimension, but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_); - AT_CHECK(output_w >= 1, + TORCH_CHECK(output_w >= 1, "input (W: ", input_w, ")is too small. Calculated output W: ", output_w); if (input_.ndimension() == 2) { @@ -206,10 +206,10 @@ void reflection_pad1d_backward_out_template( Tensor & grad_input, const Tensor & grad_output_, const Tensor & input, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input), + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(canUse32BitIndexMath(grad_output_), + TORCH_CHECK(canUse32BitIndexMath(grad_output_), "input tensor must fit into 32-bit index math"); int64_t dim_plane = 0; @@ -231,7 +231,7 @@ void reflection_pad1d_backward_out_template( Tensor grad_output = grad_output_.contiguous(); - AT_CHECK(output_w == grad_output.size(dim_w), + TORCH_CHECK(output_w == grad_output.size(dim_w), "gradOutput width unexpected. 
Expected: ", output_w, ", Got: ", grad_output.size(dim_w)); @@ -252,7 +252,7 @@ void reflection_pad1d_backward_out_template( void reflection_pad2d_out_template( Tensor &output, const Tensor &input_, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input_), + TORCH_CHECK(canUse32BitIndexMath(input_), "input tensor must fit into 32-bit index math"); int plane_dim = 0; @@ -260,7 +260,7 @@ void reflection_pad2d_out_template( int dim_w = 2; int nbatch = 1; - AT_CHECK(input_.numel() > 0 && + TORCH_CHECK(input_.numel() > 0 && (input_.ndimension() == 3 || input_.ndimension() == 4), "non-empty 3D or " "4D (batch mode) tensor expected for input, but got: ", input_); @@ -280,12 +280,12 @@ void reflection_pad2d_out_template( int input_h = input_.size(dim_h); int input_w = input_.size(dim_w); - AT_CHECK(pad_l < input_w && pad_r < input_w, + TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Padding size should be less than the corresponding input dimension, but " "got: padding (", pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.sizes()); - AT_CHECK(pad_t < input_h && pad_b < input_h, + TORCH_CHECK(pad_t < input_h && pad_b < input_h, "Padding size should be less than the corresponding input dimension, but " "got: padding (", pad_t, ", ", pad_b, ") at dimension ", dim_h, " of input ", input_.sizes()); @@ -293,7 +293,7 @@ void reflection_pad2d_out_template( int output_h = input_h + pad_t + pad_b; int output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w >= 1 || output_h >= 1, + TORCH_CHECK(output_w >= 1 || output_h >= 1, "input (H: ", input_h, ", W: ", input_w, ")is too small. Calculated " "output H: ", output_h, " W: ", output_w); @@ -326,9 +326,9 @@ void reflection_pad2d_out_template( void reflection_pad2d_backward_out_template( Tensor &grad_input, const Tensor &grad_output_, const Tensor &input, IntArrayRef padding) { - AT_CHECK(canUse32BitIndexMath(input), + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(canUse32BitIndexMath(grad_output_), + TORCH_CHECK(canUse32BitIndexMath(grad_output_), "output gradient tensor must fit into 32-bit index math"); int plane_dim = 0; @@ -355,9 +355,9 @@ void reflection_pad2d_backward_out_template( int output_h = input_h + pad_t + pad_b; int output_w = input_w + pad_l + pad_r; - AT_CHECK(output_w == grad_output_.size(dim_w), "grad_output width " + TORCH_CHECK(output_w == grad_output_.size(dim_w), "grad_output width " "unexpected. Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); - AT_CHECK(output_h == grad_output_.size(dim_h), "grad_output height " + TORCH_CHECK(output_h == grad_output_.size(dim_h), "grad_output height " "unexpected. 
Expected: ", output_h, ", Got: ", grad_output_.size(dim_h)); Tensor grad_output = grad_output_.contiguous(); diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index 867ebf275bfa..c9da8f440b72 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -205,9 +205,9 @@ void replication_pad1d_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -216,7 +216,7 @@ void replication_pad1d_out_cuda_template( int numBatch = 1; int numInputDims = input.ndimension(); - AT_CHECK(input.numel() > 0 && (numInputDims == 2 || numInputDims == 3), + TORCH_CHECK(input.numel() > 0 && (numInputDims == 2 || numInputDims == 3), "2D or 3D (batch mode) tensor expected for input") if (numInputDims == 3) { @@ -229,7 +229,7 @@ void replication_pad1d_out_cuda_template( int inputW = input.size(dimw); int outputW = inputW + padL + padR; - AT_CHECK(outputW >= 1, + TORCH_CHECK(outputW >= 1, "input (W: ", inputW, ")is too small." " Calculated output W: ", outputW); @@ -279,11 +279,11 @@ void replication_pad1d_backward_out_cuda_template( IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), "output gradient tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); + TORCH_CHECK(paddingSize.size() == 2, "padding Size is expected to be 2"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -298,7 +298,7 @@ void replication_pad1d_backward_out_cuda_template( int iwidth = input.size(dimw); int owidth = iwidth + padL + padR; - AT_CHECK(owidth == gradOutput.size(dimw), + TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); @@ -336,9 +336,9 @@ void replication_pad2d_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -350,7 +350,7 @@ void replication_pad2d_out_cuda_template( int numBatch = 1; int numInputDims = input.dim(); - AT_CHECK(input.numel() && (numInputDims == 3 || numInputDims == 4), + TORCH_CHECK(input.numel() && (numInputDims == 3 || numInputDims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input, but got: ", input) @@ -367,7 +367,7 @@ void replication_pad2d_out_cuda_template( int outputH = inputH + padT + padB; int outputW = inputW + padL + padR; - AT_CHECK(outputW >= 1 || outputH >= 1, + TORCH_CHECK(outputW >= 1 || outputH >= 1, "input (H: ", inputH, ", W: ", inputW, ") is too small." 
" Calculated output H: ", outputH, " W: ", outputW); @@ -418,11 +418,11 @@ void replication_pad2d_backward_out_cuda_template( IntArrayRef paddingSize) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), "output gradient tensor must fit into 32-bit index math"); - AT_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); + TORCH_CHECK(paddingSize.size() == 4, "padding Size is expected to be 4"); int padL = paddingSize[0]; int padR = paddingSize[1]; @@ -443,10 +443,10 @@ void replication_pad2d_backward_out_cuda_template( int oheight = iheight + padT + padB; int owidth = iwidth + padL + padR; - AT_CHECK(owidth == gradOutput.size(dimw), + TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); - AT_CHECK(oheight == gradOutput.size(dimh), + TORCH_CHECK(oheight == gradOutput.size(dimh), "gradOutput height unexpected. Expected: ", oheight, ", Got: ", gradOutput.size(dimh)); @@ -483,11 +483,11 @@ static inline void shapeCheck3d( int pleft, int pright, int ptop, int pbottom, int pfront, int pback) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - AT_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), + TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); int planeDim = 0; @@ -508,7 +508,7 @@ static inline void shapeCheck3d( int odepth = idepth + pfront + pback; int oheight = iheight + ptop + pbottom; int owidth = iwidth + pleft + pright; - AT_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, ") is too small." " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); @@ -521,11 +521,11 @@ static inline void shapeAndGradOutputCheck3d( int pleft, int pright, int ptop, int pbottom, int pfront, int pback) { - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(input), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - AT_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), + TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); int planeDim = 0; @@ -546,24 +546,24 @@ static inline void shapeAndGradOutputCheck3d( int odepth = idepth + pfront + pback; int oheight = iheight + ptop + pbottom; int owidth = iwidth + pleft + pright; - AT_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, ") is too small." " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); - AT_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), + TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), "output gradient tensor must fit into 32-bit index math"); - AT_CHECK(numPlanes == gradOutput.size(planeDim), + TORCH_CHECK(numPlanes == gradOutput.size(planeDim), "gradOutput width unexpected. 
Expected: ", numPlanes, ", Got: ", gradOutput.size(planeDim)); - AT_CHECK(owidth == gradOutput.size(dimw), + TORCH_CHECK(owidth == gradOutput.size(dimw), "gradOutput width unexpected. Expected: ", owidth, ", Got: ", gradOutput.size(dimw)); - AT_CHECK(oheight == gradOutput.size(dimh), + TORCH_CHECK(oheight == gradOutput.size(dimh), "gradOutput height unexpected. Expected: ", oheight, ", Got: ", gradOutput.size(dimh)); - AT_CHECK(odepth == gradOutput.size(dimd), + TORCH_CHECK(odepth == gradOutput.size(dimd), "gradOutput depth unexpected. Expected: ", odepth, ", Got: ", gradOutput.size(dimd)); } @@ -573,7 +573,7 @@ void replication_pad3d_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; @@ -654,7 +654,7 @@ void replication_pad3d_backward_out_cuda_template( const Tensor& input, IntArrayRef paddingSize) { - AT_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); + TORCH_CHECK(paddingSize.size() == 6, "padding Size is expected to be 6"); int pleft = paddingSize[0]; int pright = paddingSize[1]; int ptop = paddingSize[2]; diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index ef3031e2208b..3b2db0fa0767 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -330,6 +330,7 @@ blockReduce(AccumT* smem, AccumT val, AccumT warpVal = defaultVal; // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1; if (threadIdx.x < 32) { int lane = threadIdx.x % 32; if (lane < blockDim.x / 32) { @@ -337,6 +338,9 @@ blockReduce(AccumT* smem, AccumT val, for (int i = 0; i < 32; ++i) { warpVal = r(warpVal, smem[lane * 32 + i]); } +#if CUDA_VERSION >= 9000 + __syncwarp(mask); +#endif smem[lane] = warpVal; } } @@ -482,7 +486,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t static_assert(std::is_same, float>::value, "accscalar_t for half should be float"); if (input.dim() == 0) input = input.view(1); int64_t dim = maybe_wrap_dim(dim_, input.dim()); - AT_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); int64_t outer_size = 1; int64_t dim_size = input.size(dim); @@ -557,7 +561,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t auto grad = grad_.contiguous(); static_assert(std::is_same, float>::value, "accscalar_t for half should be float"); if (grad.dim() == 0) grad = grad.view(1); - AT_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); auto output = output_.contiguous(); if (output.dim() == 0) output = output.view(1); int64_t outer_size = 1; diff --git a/aten/src/ATen/native/cuda/SortingKthValue.cu b/aten/src/ATen/native/cuda/SortingKthValue.cu index ebf1dc8d7e6a..2c2c63cc06cc 100644 --- a/aten/src/ATen/native/cuda/SortingKthValue.cu +++ b/aten/src/ATen/native/cuda/SortingKthValue.cu @@ -145,11 +145,11 @@ void kthvalue_cuda_template( // FIXME: This seems bogus, I only do this because it was the old behaviour. 
// The reductions are fine, as long as the axis being reduced along // isn't of 0 elements (and the output has elements). - AT_CHECK( + TORCH_CHECK( self.numel() > 0, "cannot perform reduction function kthvalue", " on tensor with no elements because the operation does not have an identity"); - AT_CHECK(k >= 1 && k <= slicesize, "selected number k out of range"); + TORCH_CHECK(k >= 1 && k <= slicesize, "selected number k out of range"); _reduction_with_indices_allocate_or_resize_output( values, indices, self, dim, keepdim); @@ -159,7 +159,7 @@ void kthvalue_cuda_template( return; } - AT_CHECK( + TORCH_CHECK( self.dim() <= MAX_TENSORINFO_DIMS, "cannot operate on more than ", MAX_TENSORINFO_DIMS, @@ -188,14 +188,14 @@ void kthvalue_cuda_template( // this does not reduce to median with dim beause we don't want to copy twice template Tensor median_cuda_template(const Tensor& self) { - AT_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); + TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); if (self.dim() == 0 && self.numel() == 1) { return self.clone(); } auto self_copy = self.clone().view(-1); auto values = at::empty({1}, self.options()); auto indices = at::empty({1}, self.options().dtype(kLong)); - AT_CHECK( + TORCH_CHECK( self.dim() <= MAX_TENSORINFO_DIMS, "cannot operate on more than ", MAX_TENSORINFO_DIMS, diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index 9efd035178fa..ea340cdd9b61 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -121,7 +121,7 @@ struct TopKTypeConfig { typedef uint32_t RadixType; static inline __device__ RadixType convert(at::Half v) { -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) RadixType x = __half_as_ushort(v); RadixType mask = -((x >> 15)) | 0x8000; return (v == v) ? 
(x ^ mask) : 0xffff; @@ -132,7 +132,7 @@ struct TopKTypeConfig { } static inline __device__ at::Half deconvert(RadixType v) { -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_HCC__) RadixType mask = ((v >> 15) - 1) | 0x8000; return __ushort_as_half(v ^ mask); #else diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 97a712a4c184..fd8eb29c8377 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -286,7 +286,7 @@ CuFFTParamsLRUCache &cufft_get_plan_cache(int64_t device_index) { namespace detail { int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_get_plan_cache_max_size: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); @@ -294,7 +294,7 @@ int64_t cufft_get_plan_cache_max_size_impl(int64_t device_index) { } void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_set_plan_cache_max_size: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); @@ -302,7 +302,7 @@ void cufft_set_plan_cache_max_size_impl(int64_t device_index, int64_t max_size) } int64_t cufft_get_plan_cache_size_impl(int64_t device_index) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_get_plan_cache_size: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); @@ -310,7 +310,7 @@ int64_t cufft_get_plan_cache_size_impl(int64_t device_index) { } void cufft_clear_plan_cache_impl(int64_t device_index) { - AT_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), + TORCH_CHECK(0 <= device_index && device_index < at::detail::getCUDAHooks().getNumGPUs(), "cufft_clear_plan_cache: expected 0 <= device_index < ", at::detail::getCUDAHooks().getNumGPUs(), "], but got device_index=", device_index); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 7428f988c4b4..3143fec586d7 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -311,7 +311,7 @@ Tensor _histc_cuda_template( if (nbins <= 0) { AT_ERROR("bins must be > 0"); } - Tensor output = native::zeros({nbins}, device(DeviceType::CUDA).dtype(kLong)); + Tensor output = native::zeros({nbins}, device(DeviceType::CUDA).dtype(self.scalar_type())); input_t minvalue = min; input_t maxvalue = max; if (min == max) { @@ -322,7 +322,8 @@ Tensor _histc_cuda_template( minvalue = minvalue - 1; maxvalue = maxvalue + 1; } - auto ret = cuda::CUDA_tensor_histogram( + + auto ret = cuda::CUDA_tensor_histogram( output, self, Tensor(), nbins, minvalue, maxvalue); return output; } diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 90d0208569f6..cc4e5678457d 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ 
b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -26,7 +26,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n) { } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { - AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); if(m < 0) { m = n; @@ -46,7 +46,7 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { Tensor empty_cuda(IntArrayRef size, const TensorOptions& options) { AT_ASSERT(options.backend() == at::Backend::CUDA); AT_ASSERT(!options.is_variable()); // is_variable should have been 'unpacked' // TODO: remove this when Variable and Tensor are merged - AT_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); + TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); check_size_nonnegative(size); auto* allocator = at::cuda::getCUDADeviceAllocator(); @@ -74,8 +74,8 @@ Tensor empty_strided_cuda(IntArrayRef size, IntArrayRef stride, const TensorOpti } Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { - AT_CHECK(n >= 0, "n must be non-negative, got", n); - AT_CHECK(at::scalar_tensor(n, result.options()).defined(), + TORCH_CHECK(n >= 0, "n must be non-negative, got", n); + TORCH_CHECK(at::scalar_tensor(n, result.options()).defined(), "n is too large for result tensor type: '", result.type().toString(), "'"); result.resize_({n}); @@ -322,7 +322,7 @@ Tensor tril_indices_cuda( dim3 dim_grid; // using tril_size instead of tensor.numel(), as each thread takes care of // two elements in the tensor. - AT_CHECK( + TORCH_CHECK( cuda::getApplyGrid(tril_size, dim_grid, tensor.get_device()), "unable to get dim grid"); @@ -398,7 +398,7 @@ Tensor triu_indices_cuda( // using triu_size instead of tensor.numel(), as each thread takes care of // two elements in the tensor. 
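The at::Half convert/deconvert pair in SortingRadixSelect.cuh above uses the usual order-preserving float-to-unsigned transform: flip only the sign bit of non-negative values and flip every bit of negative values, so the resulting unsigned integers sort in the same order as the floats. A host-side sketch on raw 16-bit half patterns, with no CUDA intrinsics and the kernel's NaN special case omitted:

#include <cstdint>
#include <cstdio>

uint16_t convert(uint16_t x) {            // x: IEEE half bit pattern
  uint16_t mask = (uint16_t)((-(int)(x >> 15)) | 0x8000);  // all ones if negative, else 0x8000
  return (uint16_t)(x ^ mask);
}
uint16_t deconvert(uint16_t v) {          // exact inverse of convert
  uint16_t mask = (uint16_t)(((v >> 15) - 1) | 0x8000);
  return (uint16_t)(v ^ mask);
}

int main() {
  // Half bit patterns for -2.0, -1.0, 0.0, 1.0, 2.0; after convert() the
  // unsigned values must be strictly increasing, and deconvert() restores them.
  uint16_t vals[5] = {0xC000, 0xBC00, 0x0000, 0x3C00, 0x4000};
  for (uint16_t v : vals)
    std::printf("%#06x -> %#06x (back: %#06x)\n",
                (unsigned)v, (unsigned)convert(v), (unsigned)deconvert(convert(v)));
}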
- AT_CHECK( + TORCH_CHECK( cuda::getApplyGrid(triu_size, dim_grid, tensor.get_device()), "unable to get dim grid"); diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index d07863423e6d..b9a06cb128ee 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -179,7 +179,7 @@ Tensor roll_cuda(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { dim3 dim_block = cuda::getApplyBlock(); dim3 dim_grid; - AT_CHECK(cuda::getApplyGrid(N, dim_grid, in_tensor.get_device()), "unable to get dim grid"); + TORCH_CHECK(cuda::getApplyGrid(N, dim_grid, in_tensor.get_device()), "unable to get dim grid"); auto total_dims = in_tensor.dim(); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu new file mode 100644 index 000000000000..74dabf5a13da --- /dev/null +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +template +void fill_kernel_impl(TensorIterator& iter, Scalar value_scalar) { + auto value = value_scalar.to(); + gpu_nullary_kernel(iter, [value]GPU_LAMBDA() -> scalar_t { + return value; + }); +} + +static void fill_kernel_cuda(TensorIterator& iter, Scalar value) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, iter.dtype(), "fill_cuda", [&]() { + fill_kernel_impl(iter, value); + }); +} + +REGISTER_DISPATCH(fill_stub, &fill_kernel_cuda); + +}} diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index 3395701efc5e..10ad1d7cec1b 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -38,7 +38,7 @@ std::tuple compute_unique( if (!return_inverse) { inverse_indices = at::empty({0}, options); } else { - AT_CHECK(sorted_indices.defined(), + TORCH_CHECK(sorted_indices.defined(), "return_inverse is set to true, but sorted_indices is undefined. Send a bug report!"); const int64_t *sorted_indices_ptr = sorted_indices.data(); Tensor inv_loc = at::empty({num_inp}, options); diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh new file mode 100644 index 000000000000..ff9d9594ea6a --- /dev/null +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -0,0 +1,247 @@ +#include +#include +#include + +#include + +namespace at { +namespace native { + +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? 
a : b; +} + +static inline void upsample_1d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int nbatch, + int nchannels, + int input_width, + int output_width) { + TORCH_CHECK( + input_width > 0 && output_width > 0, + "input and output sizes should be greater than 0, but got input (W: ", + input_width, + ") and output (W: ", + output_width, + ")"); + + if (input.defined()) { + TORCH_CHECK( + input.numel() != 0 && input.dim() == 3, + "non-empty 3D input tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 3, 0, nbatch); + check_dim_size(grad_output, 3, 1, nchannels); + check_dim_size(grad_output, 3, 2, output_width); + } +} + +static inline void upsample_2d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int nbatch, + int nchannels, + int input_height, + int input_width, + int output_height, + int output_width) { + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + TORCH_CHECK( + input.numel() != 0 && input.dim() == 4, + "non-empty 4D input tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 4, 0, nbatch); + check_dim_size(grad_output, 4, 1, nchannels); + check_dim_size(grad_output, 4, 2, output_height); + check_dim_size(grad_output, 4, 3, output_width); + } +} + +static inline void upsample_3d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int nbatch, + int nchannels, + int input_depth, + int input_height, + int input_width, + int output_depth, + int output_height, + int output_width) { + TORCH_CHECK( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (D: ", + input_depth, + ", H: ", + input_height, + ", W: ", + input_width, + ") output (D: ", + output_depth, + ", H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + TORCH_CHECK( + input.numel() != 0 && input.dim() == 5, + "Non-empty 5D data tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 5, 0, nbatch); + check_dim_size(grad_output, 5, 1, nchannels); + check_dim_size(grad_output, 5, 2, output_depth); + check_dim_size(grad_output, 5, 3, output_height); + check_dim_size(grad_output, 5, 4, output_width); + } +} + +template +__host__ __forceinline__ static accscalar_t area_pixel_compute_scale( + int input_size, + int output_size, + bool align_corners) { + if (output_size > 1) { + return align_corners ? (accscalar_t)(input_size - 1) / (output_size - 1) + : (accscalar_t)input_size / output_size; + } else { + return static_cast(0); + } +} + +template +__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index( + accscalar_t scale, + int dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + accscalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // See Note[Follow Opencv resize logic] + return (!cubic && src_idx < static_cast(0)) + ? 
static_cast(0) + : src_idx; + } +} + +__device__ __forceinline__ static int nearest_neighbor_compute_source_index( + const float scale, + int dst_index, + int input_size) { + const int src_index = + min(static_cast(floorf(dst_index * scale)), input_size - 1); + return src_index; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ static scalar_t upsample_get_value_bounded( + const PackedTensorAccessor& data, + int batch, + int channel, + int height, + int width, + int y, + int x) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch][channel][access_y][access_x]; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ static void upsample_increment_value_bounded( + PackedTensorAccessor& data, + int batch, + int channel, + int height, + int width, + int y, + int x, + accscalar_t value) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + /* TODO: result here is trucated to scalar_t, + check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912 + */ + atomicAdd( + &data[batch][channel][access_y][access_x], static_cast(value)); +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +__device__ __forceinline__ static accscalar_t cubic_convolution1( + accscalar_t x, + accscalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +__device__ __forceinline__ static accscalar_t cubic_convolution2( + accscalar_t x, + accscalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +__device__ __forceinline__ static void get_cubic_upsampling_coefficients( + accscalar_t coeffs[4], + accscalar_t t) { + accscalar_t A = -0.75; + + accscalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + accscalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +__device__ __forceinline__ static accscalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + accscalar_t t) { + accscalar_t coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 5375e33513ca..443e88ec078b 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -1,25 +1,329 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bicubic2d_out_frame( + const int num_elements, + const accscalar_t height_scale, + const accscalar_t width_scale, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int input_height = idata.size(2); + const int input_width = idata.size(3); + const int output_height = odata.size(2); + const int output_width = odata.size(3); + + if (index >= num_elements) { + return; + } + + // Special case: input and output are the same size, just copy + const int output_x = index % output_width; + 
const int output_y = index / output_width; + + if (input_height == output_height && input_width == output_width) { + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; c++) { + const scalar_t val = idata[n][c][output_y][output_x]; + odata[n][c][output_y][output_x] = val; + } + } + return; + } + + // Interpolation kernel + accscalar_t real_x = area_pixel_compute_source_index( + width_scale, output_x, align_corners, /*cubic=*/true); + int in_x = floorf(real_x); + accscalar_t t_x = real_x - in_x; + + accscalar_t real_y = area_pixel_compute_source_index( + height_scale, output_y, align_corners, /*cubic=*/true); + int in_y = floorf(real_y); + accscalar_t t_y = real_y - in_y; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; c++) { + accscalar_t coefficients[4]; + + for (int k = 0; k < 4; k++) { + coefficients[k] = cubic_interp1d( + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x - 1), + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x + 0), + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x + 1), + upsample_get_value_bounded( + idata, n, c, input_height, input_width, in_y - 1 + k, in_x + 2), + t_x); + } + + odata[n][c][output_y][output_x] = static_cast(cubic_interp1d( + coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + t_y)); + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bicubic2d_backward_out_frame( + const int num_elements, + const accscalar_t height_scale, + const accscalar_t width_scale, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int input_height = idata.size(2); + const int input_width = idata.size(3); + const int output_height = odata.size(2); + const int output_width = odata.size(3); + + if (index >= num_elements) { + return; + } + + const int output_x = index % output_width; + const int output_y = index / output_width; + // special case: output_xust copy + if (input_height == output_height && input_width == output_width) { + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][output_y][output_x]; + idata[n][c][output_y][output_x] = val; + } + } + return; + } + + accscalar_t real_x = area_pixel_compute_source_index( + width_scale, output_x, align_corners, /*cubic=*/true); + int input_x = floorf(real_x); + accscalar_t t_x = real_x - input_x; + + accscalar_t real_y = area_pixel_compute_source_index( + height_scale, output_y, align_corners, /*cubic=*/true); + int input_y = floorf(real_y); + accscalar_t t_y = real_y - input_y; + + accscalar_t x_coeffs[4]; + accscalar_t y_coeffs[4]; + + get_cubic_upsampling_coefficients(x_coeffs, t_x); + get_cubic_upsampling_coefficients(y_coeffs, t_y); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + scalar_t out_value = odata[n][c][output_y][output_x]; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + upsample_increment_value_bounded( + idata, + n, + c, + input_height, + input_width, + input_y - 1 + i, + input_x - 1 + j, + out_value * y_coeffs[i] * x_coeffs[j]); + } + } + } + } +} + +static void upsample_bicubic2d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef 
output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_bicubic2d_out", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + upsample_2d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + output.resize_({input.size(0), input.size(1), output_height, output_width}); + output.zero_(); + + AT_ASSERT( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0); + + const int num_output_elements = output_height * output_width; + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + + // Launch kernel + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_bicubic2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + // Get scaling factors + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_bicubic2d_out_frame + <<>>( + num_output_elements, + rheight, + rwidth, + align_corners, + idata, + odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_bicubic2d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_bicubic2d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + upsample_2d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_({nbatch, channels, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_bicubic2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + 
upsample_bicubic2d_backward_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_bicubic2d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_forward_out( - output, input, output_size, align_corners); + upsample_bicubic2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_bicubic2d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_bicubic2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_bicubic2d_backward_out_cuda( @@ -28,8 +332,9 @@ Tensor& upsample_bicubic2d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_bicubic2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_bicubic2d_backward_cuda( @@ -37,9 +342,11 @@ Tensor upsample_bicubic2d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bicubic2d_backward( - grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_bicubic2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 7c53443f70a6..d4e8d1b14f55 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -1,25 +1,308 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bilinear2d_out_frame( + const int n, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][h1][w1]; + odata[n][c][h2][w2] = val; + } + } + return; + } + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 
1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const accscalar_t val = h0lambda * + (w0lambda * idata[n][c][h1][w1] + + w1lambda * idata[n][c][h1][w1 + w1p]) + + h1lambda * + (w0lambda * idata[n][c][h1 + h1p][w1] + + w1lambda * idata[n][c][h1 + h1p][w1 + w1p]); + odata[n][c][h2][w2] = static_cast(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_bilinear2d_backward_out_frame( + const int n, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][h1][w1]; + idata[n][c][h2][w2] = val; + } + } + return; + } + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 
1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][h2][w2]; + atomicAdd( + &idata[n][c][h1][w1], + static_cast(h0lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][h1][w1 + w1p], + static_cast(h0lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][h1 + h1p][w1], + static_cast(h1lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][h1 + h1p][w1 + w1p], + static_cast(h1lambda * w1lambda * d2val)); + } + } + } +} + +static void upsample_bilinear2d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_bilinear2d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + upsample_2d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + output.resize_({input.size(0), input.size(1), output_height, output_width}); + output.zero_(); + + AT_ASSERT( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_bilinear2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_bilinear2d_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_bilinear2d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_bilinear2d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + upsample_2d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_({nbatch, channels, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = 
output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_bilinear2d_backward_out_frame + <<>>( + num_kernels, rheight, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_bilinear2d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_forward_out( - output, input, output_size, align_corners); + upsample_bilinear2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_bilinear2d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_bilinear2d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_bilinear2d_backward_out_cuda( @@ -28,8 +311,9 @@ Tensor& upsample_bilinear2d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_bilinear2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_bilinear2d_backward_cuda( @@ -37,9 +321,11 @@ Tensor upsample_bilinear2d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_bilinear2d_backward( - grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_bilinear2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index eb491f5e9c2b..0f70b57344cb 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -1,25 +1,248 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +#ifdef __HIP_PLATFORM_HCC__ +C10_LAUNCH_BOUNDS_1(1024) +#endif +__global__ void upsample_linear1d_out_frame( + const int n, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + if (index < n) { + const int w2 = index % width2; + // 
special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][w1]; + odata[n][c][w2] = val; + } + } + return; + } + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const accscalar_t val = + w0lambda * idata[n][c][w1] + w1lambda * idata[n][c][w1 + w1p]; + odata[n][c][w2] = static_cast(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +#ifdef __HIP_PLATFORM_HCC__ +C10_LAUNCH_BOUNDS_1(1024) +#endif +__global__ void upsample_linear1d_out_frame_backward( + const int n, + const accscalar_t rwidth, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][w1]; + idata[n][c][w2] = val; + } + } + return; + } + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][w2]; + atomicAdd(&idata[n][c][w1], static_cast(w0lambda * d2val)); + atomicAdd( + &idata[n][c][w1 + w1p], static_cast(w1lambda * d2val)); + } + } + } +} + +static void upsample_linear1d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_linear1d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + int output_width = output_size[0]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_width = input.size(2); + + upsample_1d_shape_check( + input, Tensor(), nbatch, channels, input_width, output_width); + + output.resize_({input.size(0), input.size(1), output_width}); + output.zero_(); + + AT_ASSERT(input_width > 0 && output_width > 0); + + const int num_kernels = output_width; + const int num_threads = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_linear1d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_linear1d_out_frame + <<>>(num_kernels, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + 
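// Illustrative sketch (not part of the patch): a single-threaded CPU version
// of the scatter that upsample_linear1d_out_frame_backward above performs and
// that the backward template below launches. Each output element pushes its
// gradient into the two neighbouring input elements with the same w0/w1
// weights used in the forward pass; on the GPU the writes go through
// atomicAdd because different output elements can land on the same input
// element, while a single CPU thread can simply use +=. The inline source
// index formula is an assumption about area_pixel_compute_source_index;
// grad_in must be zero-initialised and rwidth is the value produced by
// area_pixel_compute_scale.
static void linear1d_backward_row(float* grad_in, const float* grad_out,
                                  int width1, int width2, float rwidth,
                                  bool align_corners) {
  for (int w2 = 0; w2 < width2; ++w2) {
    float w1r = align_corners ? rwidth * w2 : rwidth * (w2 + 0.5f) - 0.5f;
    if (w1r < 0.f) {
      w1r = 0.f;
    }
    const int w1 = static_cast<int>(w1r);
    const int w1p = (w1 < width1 - 1) ? 1 : 0;
    const float w1lambda = w1r - w1;
    grad_in[w1] += (1.f - w1lambda) * grad_out[w2];
    grad_in[w1 + w1p] += w1lambda * grad_out[w2];
  }
}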
+static void upsample_linear1d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_output_arg{grad_output_, "grad_output_", 1}, + grad_input_arg{grad_input, "grad_input", 2}; + checkAllSameGPU( + "upsample_linear1d_backward_out_cuda", {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int output_width = output_size[0]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_width = input_size[2]; + + upsample_1d_shape_check( + Tensor(), grad_output_, nbatch, channels, input_width, output_width); + + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_({nbatch, channels, input_width}); + grad_input.zero_(); + + const int num_kernels = output_width; + const int num_threads = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_linear1d_out_frame_backward", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_linear1d_out_frame_backward + <<>>(num_kernels, rwidth, align_corners, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_linear1d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_forward_out( - output, input, output_size, align_corners); + upsample_linear1d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_linear1d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_linear1d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_linear1d_backward_out_cuda( @@ -28,8 +251,9 @@ Tensor& upsample_linear1d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_linear1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_linear1d_backward_cuda( @@ -37,9 +261,11 @@ Tensor upsample_linear1d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_linear1d_backward( - grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_linear1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index bfba5a16ddfc..2218d2775a53 100644 --- 
a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -1,23 +1,215 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { -Tensor& upsample_nearest1d_out_cuda( +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest1d_out_frame( + const int n, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + const float scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][w1]; + odata[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][w1]; + odata[n][c][w2] = val; + } + } + } +} + +// Backward operation +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest1d_backward_out_frame( + const int n, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int width1 = idata.size(2); + const int width2 = odata.size(2); + + const float scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; + // special case: just copy + if (width1 == width2) { + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][w1]; + idata[n][c][w2] = val; + } + } + return; + } + // + const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][w2]; + atomicAdd(&idata[n][c][w1], d2val); + } + } + } +} + +static void upsample_nearest1d_out_cuda_template( Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest1d_forward_out( - output, input, output_size); + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_nearest1d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + int output_width = output_size[0]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_width = input.size(2); + + upsample_1d_shape_check( + input, Tensor(), nbatch, channels, input_width, output_width); + + AT_ASSERT(input_width > 0 && output_width > 0); + + output.resize_({input.size(0), input.size(1), output_width}); + output.zero_(); + + const int num_kernels = output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_nearest1d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + 
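// Illustrative note (not part of the patch): the launch configuration inside
// the <<< >>> chevrons is not visible in this rendering of the diff. For
// these one-thread-per-output-element kernels it presumably amounts to
// ceiling division of num_kernels by num_threads, roughly:
//
//   const int num_blocks = (num_kernels + num_threads - 1) / num_threads;
//   upsample_nearest1d_out_frame<...>   // template arguments as declared above
//       <<<num_blocks, num_threads, 0, stream>>>(num_kernels, idata, odata);
//
// Worked example: 2500 output elements with 1024 threads per block give
// ceil(2500 / 1024) = 3 blocks, and the `if (index < n)` guard in the kernel
// discards the surplus threads of the last block.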
upsample_nearest1d_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_nearest1d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_nearest1d_backward_out_cuda_template", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int output_width = output_size[0]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_width = input_size[2]; + + upsample_1d_shape_check( + Tensor(), grad_output_, nbatch, channels, input_width, output_width); + + Tensor grad_output = grad_output_.contiguous(); + grad_input.resize_({nbatch, channels, input_width}); + grad_input.zero_(); + + const int num_kernels = output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_nearest1d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + upsample_nearest1d_backward_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); } -Tensor upsample_nearest1d_cuda( +} // namespace + +Tensor& upsample_nearest1d_out_cuda( + Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest1d_forward( - input, output_size); + upsample_nearest1d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor upsample_nearest1d_cuda(const Tensor& input, IntArrayRef output_size) { + Tensor output = at::empty_like(input); + upsample_nearest1d_out_cuda_template(output, input, output_size); + return output; } Tensor& upsample_nearest1d_backward_out_cuda( @@ -25,17 +217,20 @@ Tensor& upsample_nearest1d_backward_out_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest1d_backward_out( - grad_input, grad_output, output_size, input_size); + upsample_nearest1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } Tensor upsample_nearest1d_backward_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest1d_backward( - grad_output, output_size, input_size); + Tensor grad_input = at::empty_like(grad_output); + upsample_nearest1d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index 83f40f68d6fe..f8d99609b84c 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -1,23 +1,255 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { -Tensor& 
upsample_nearest2d_out_cuda( +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest2d_out_frame( + const int n, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][h1][w1]; + odata[n][c][h2][w2] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][h1][w1]; + odata[n][c][h2][w2] = val; + } + } + } +} + +// Backward operation +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest2d_backward_out_frame( + const int n, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int height1 = idata.size(2); + const int width1 = idata.size(3); + const int height2 = odata.size(2); + const int width2 = odata.size(3); + + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][h2][w2]; + idata[n][c][h1][w1] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][h2][w2]; + atomicAdd(&idata[n][c][h1][w1], d2val); + } + } + } +} + +static void upsample_nearest2d_out_cuda_template( Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest2d_forward_out( - output, input, output_size); + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU( + "upsample_nearest2d_out_cuda_template", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + upsample_2d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_height, + input_width, + output_height, + 
output_width); + + AT_ASSERT( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0); + + output.resize_({input.size(0), input.size(1), output_height, output_width}); + output.zero_(); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_nearest2d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + upsample_nearest2d_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_nearest2d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_nearest2d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + upsample_2d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_height, + input_width, + output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + grad_input.resize_({nbatch, channels, input_height, input_width}); + + grad_input.zero_(); + + const int num_kernels = output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + upsample_nearest2d_backward_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); } -Tensor upsample_nearest2d_cuda( +} // namespace + +Tensor& upsample_nearest2d_out_cuda( + Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest2d_forward( - input, output_size); + upsample_nearest2d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor upsample_nearest2d_cuda(const Tensor& input, IntArrayRef output_size) { + Tensor output = at::empty_like(input); + upsample_nearest2d_out_cuda_template(output, input, output_size); + return output; } Tensor& upsample_nearest2d_backward_out_cuda( @@ -25,17 +257,20 @@ Tensor& upsample_nearest2d_backward_out_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest2d_backward_out( - grad_input, grad_output, output_size, input_size); + upsample_nearest2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } Tensor upsample_nearest2d_backward_cuda( const Tensor& grad_output, IntArrayRef output_size, 
IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest2d_backward( - grad_output, output_size, input_size); + Tensor grad_input = at::empty_like(grad_output); + upsample_nearest2d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index bb208a5986ba..39590bbfb40b 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -1,23 +1,280 @@ #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { -Tensor& upsample_nearest3d_out_cuda( +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest3d_out_frame( + const int n, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + const float depth_scale = (float)depth1 / (float)depth2; + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int d2 = index / (height2 * width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][d1][h1][w1]; + odata[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = + nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][d1][h1][w1]; + odata[n][c][d2][h2][w2] = val; + } + } + } +} + +// Backward operation +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_nearest3d_backward_out_frame( + const int n, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + const float depth_scale = (float)depth1 / (float)depth2; + const float height_scale = (float)height1 / (float)height2; + const float width_scale = (float)width1 / (float)width2; + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int d2 = index / (height2 * width2); // 0:depth2-1 + + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == 
width2) { + const int d1 = d2; + const int h1 = h2; + const int w1 = w2; + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][d1][h1][w1]; + idata[n][c][d2][h2][w2] = val; + } + } + return; + } + // + const int h1 = + nearest_neighbor_compute_source_index(height_scale, h2, height1); + const int w1 = + nearest_neighbor_compute_source_index(width_scale, w2, width1); + const int d1 = + nearest_neighbor_compute_source_index(depth_scale, d2, depth1); + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][d2][h2][w2]; + atomicAdd(&idata[n][c][d1][h1][w1], val); + } + } + } +} + +static void upsample_nearest3d_out_cuda_template( Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest3d_forward_out( - output, input, output_size); + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_nearest3d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_depth = input.size(2); + int input_height = input.size(3); + int input_width = input.size(4); + + upsample_3d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width); + + AT_ASSERT( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0); + + output.resize_({input.size(0), + input.size(1), + output_depth, + output_height, + output_width}); + output.zero_(); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_nearest3d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + upsample_nearest3d_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); } -Tensor upsample_nearest3d_cuda( +static void upsample_nearest3d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_nearest3d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_depth = input_size[2]; + int input_height = input_size[3]; + int input_width = input_size[4]; + + upsample_3d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + 
output_height, + output_width); + + Tensor grad_output = grad_output_.contiguous(); + grad_input.resize_( + {nbatch, channels, input_depth, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "upsample_nearest3d_backward_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + upsample_nearest3d_backward_out_frame + <<>>(num_kernels, idata, odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace + +Tensor& upsample_nearest3d_out_cuda( + Tensor& output, const Tensor& input, IntArrayRef output_size) { - return at::legacy::th::_thnn_upsample_nearest3d_forward( - input, output_size); + upsample_nearest3d_out_cuda_template(output, input, output_size); + return output; +} + +Tensor upsample_nearest3d_cuda(const Tensor& input, IntArrayRef output_size) { + Tensor output = at::empty_like(input); + upsample_nearest3d_out_cuda_template(output, input, output_size); + return output; } Tensor& upsample_nearest3d_backward_out_cuda( @@ -25,17 +282,20 @@ Tensor& upsample_nearest3d_backward_out_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest3d_backward_out( - grad_input, grad_output, output_size, input_size); + upsample_nearest3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } Tensor upsample_nearest3d_backward_cuda( const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size) { - return at::legacy::th::_thnn_upsample_nearest3d_backward( - grad_output, output_size, input_size); + Tensor grad_input = at::empty_like(grad_output); + upsample_nearest3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 386887fc3ed2..683860e8a466 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -1,25 +1,384 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou #include +#include #include -#include +#include +#include +#include +#include +#include namespace at { namespace native { +namespace { + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_trilinear3d_out_frame( + const int n, + const accscalar_t rdepth, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata, + PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int t2 = index / (height2 * 
width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = idata[n][c][t1][h1][w1]; + odata[n][c][t2][h2][w2] = val; + } + } + return; + } + // + const accscalar_t t1r = area_pixel_compute_source_index( + rdepth, t2, align_corners, /*cubic=*/false); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const accscalar_t t1lambda = t1r - t1; + const accscalar_t t0lambda = static_cast(1) - t1lambda; + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const accscalar_t val = t0lambda * + (h0lambda * + (w0lambda * idata[n][c][t1][h1][w1] + + w1lambda * idata[n][c][t1][h1][w1 + w1p]) + + h1lambda * + (w0lambda * idata[n][c][t1][h1 + h1p][w1] + + w1lambda * idata[n][c][t1][h1 + h1p][w1 + w1p])) + + t1lambda * + (h0lambda * + (w0lambda * idata[n][c][t1 + t1p][h1][w1] + + w1lambda * idata[n][c][t1 + t1p][h1][w1 + w1p]) + + h1lambda * + (w0lambda * idata[n][c][t1 + t1p][h1 + h1p][w1] + + w1lambda * idata[n][c][t1 + t1p][h1 + h1p][w1 + w1p])); + odata[n][c][t2][h2][w2] = static_cast(val); + } + } + } +} + +// Backward (adjoint) operation 1 <- 2 (accumulates) +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void upsample_trilinear3d_backward_out_frame( + const int n, + const accscalar_t rdepth, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + PackedTensorAccessor idata, + const PackedTensorAccessor odata) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + const int batchsize = idata.size(0); + const int channels = idata.size(1); + const int depth1 = idata.size(2); + const int height1 = idata.size(3); + const int width1 = idata.size(4); + const int depth2 = odata.size(2); + const int height2 = odata.size(3); + const int width2 = odata.size(4); + + if (index < n) { + const int w2 = (index % (height2 * width2)) % width2; // 0:width2-1 + const int h2 = (index % (height2 * width2)) / width2; // 0:height2-1 + const int t2 = index / (height2 * width2); // 0:depth2-1 + // special case: just copy + if (depth1 == depth2 && height1 == height2 && width1 == width2) { + const int t1 = t2; + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t val = odata[n][c][t1][h1][w1]; + idata[n][c][t2][h2][w2] = val; + } + } + return; + } + // + const accscalar_t t1r = area_pixel_compute_source_index( + rdepth, t2, align_corners, /*cubic=*/false); + const int t1 = t1r; + const int t1p = (t1 < depth1 - 1) ? 1 : 0; + const accscalar_t t1lambda = t1r - t1; + const accscalar_t t0lambda = static_cast(1) - t1lambda; + // + const accscalar_t h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 
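// Illustrative note (not part of the patch): the three lambda pairs computed
// in this kernel combine into eight corner weights,
//   {t0lambda, t1lambda} x {h0lambda, h1lambda} x {w0lambda, w1lambda},
// which always sum to 1. For example, t1lambda = 0.25, h1lambda = 0.5 and
// w1lambda = 0.1 give the (t1, h1, w1) corner a weight of
// 0.75 * 0.5 * 0.9 = 0.3375, and the atomicAdd calls a few lines below
// scatter exactly these eight products, times the output gradient d2val,
// into the surrounding input voxels.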
1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + // + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + // + for (int n = 0; n < batchsize; n++) { + for (int c = 0; c < channels; ++c) { + const scalar_t d2val = odata[n][c][t2][h2][w2]; + atomicAdd( + &idata[n][c][t1][h1][w1], + static_cast(t0lambda * h0lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1][h1][w1 + w1p], + static_cast(t0lambda * h0lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][t1][h1 + h1p][w1], + static_cast(t0lambda * h1lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1][h1 + h1p][w1 + w1p], + static_cast(t0lambda * h1lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1][w1], + static_cast(t1lambda * h0lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1][w1 + w1p], + static_cast(t1lambda * h0lambda * w1lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1 + h1p][w1], + static_cast(t1lambda * h1lambda * w0lambda * d2val)); + atomicAdd( + &idata[n][c][t1 + t1p][h1 + h1p][w1 + w1p], + static_cast(t1lambda * h1lambda * w1lambda * d2val)); + } + } + } +} + +static void upsample_trilinear3d_out_cuda_template( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU("upsample_trilinear3d_out_cuda", {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_depth = input.size(2); + int input_height = input.size(3); + int input_width = input.size(4); + + upsample_3d_shape_check( + input, + Tensor(), + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width); + + output.resize_({input.size(0), + input.size(1), + output_depth, + output_height, + output_width}); + output.zero_(); + + AT_ASSERT( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "upsample_trilinear3d_out_frame", [&] { + using accscalar_t = at::acc_type; + + auto idata = input.packed_accessor(); + auto odata = output.packed_accessor(); + + const accscalar_t rdepth = area_pixel_compute_scale( + input_depth, output_depth, align_corners); + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_trilinear3d_out_frame + <<>>( + num_kernels, + rdepth, + rheight, + rwidth, + align_corners, + idata, + odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +static void upsample_trilinear3d_backward_out_cuda_template( + Tensor& grad_input, + const Tensor& 
grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU( + "upsample_trilinear3d_backward_out_cuda", + {grad_output_arg, grad_input_arg}); + + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int output_depth = output_size[0]; + int output_height = output_size[1]; + int output_width = output_size[2]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_depth = input_size[2]; + int input_height = input_size[3]; + int input_width = input_size[4]; + + upsample_3d_shape_check( + Tensor(), + grad_output_, + nbatch, + channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width); + Tensor grad_output = grad_output_.contiguous(); + + grad_input.resize_( + {nbatch, channels, input_depth, input_height, input_width}); + grad_input.zero_(); + + const int num_kernels = output_depth * output_height * output_width; + const int num_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), + "upsample_trilinear3d_backward_out_frame", + [&] { + using accscalar_t = at::acc_type; + + auto idata = grad_input.packed_accessor(); + auto odata = grad_output.packed_accessor(); + + const accscalar_t rdepth = area_pixel_compute_scale( + input_depth, output_depth, align_corners); + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners); + + upsample_trilinear3d_backward_out_frame + <<>>( + num_kernels, + rdepth, + rheight, + rwidth, + align_corners, + idata, + odata); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace Tensor& upsample_trilinear3d_out_cuda( Tensor& output, const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_forward_out( - output, input, output_size, align_corners); + upsample_trilinear3d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor upsample_trilinear3d_cuda( const Tensor& input, IntArrayRef output_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_forward( - input, output_size, align_corners); + Tensor output = at::empty_like(input); + upsample_trilinear3d_out_cuda_template( + output, input, output_size, align_corners); + return output; } Tensor& upsample_trilinear3d_backward_out_cuda( @@ -28,8 +387,9 @@ Tensor& upsample_trilinear3d_backward_out_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_backward_out( - grad_input, grad_output, output_size, input_size, align_corners); + upsample_trilinear3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } Tensor upsample_trilinear3d_backward_cuda( @@ -37,9 +397,11 @@ Tensor upsample_trilinear3d_backward_cuda( IntArrayRef output_size, IntArrayRef input_size, bool align_corners) { - return at::legacy::th::_thnn_upsample_trilinear3d_backward( - 
grad_output, output_size, input_size, align_corners); + Tensor grad_input = at::empty_like(grad_output); + upsample_trilinear3d_backward_out_cuda_template( + grad_input, grad_output, output_size, input_size, align_corners); + return grad_input; } -} // native -} // at +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index fcb737fd95b8..8392feadb228 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -415,10 +415,10 @@ std::tuple weight_norm_cuda_backward { // These checks should always succeed, because weight_norm_fused_backward should only // ever be recorded in the autograd graph via weight_norm, which passes contiguous v and g. - AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); - AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); - AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); - AT_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + TORCH_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") auto grad_v = at::empty_like(saved_v); auto grad_g = at::empty_like(saved_g); diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index 7cc49cea4727..938f037e501c 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -217,10 +217,10 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) { - AT_CHECK(args.size() <= expected_size, + TORCH_CHECK(args.size() <= expected_size, "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); - AT_CHECK(args.size() >= expected_size, + TORCH_CHECK(args.size() >= expected_size, "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index c634305d33d2..f315b098dcbb 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -46,14 +46,14 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens checkBackend(c, {*log_probs}, Backend::CUDA); checkBackend(c, {*targets}, Backend::CPU); int64_t batch_size = log_probs->size(1); - AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); - AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); + TORCH_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); + TORCH_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); setCuDNNStreamToCurrent(); - AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); + TORCH_CHECK(BLANK == 0, "blank must be label 0 for 
cudnn_ctc_loss"); // checked in dispatch: // assert other conditions for cudnnCTCLoss: all label lengths <= 256 // all input lengths = logprob.size(0) diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 39e0e1cd49be..3dbe44e9c075 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -627,7 +627,7 @@ Tensor _cudnn_rnn_flatten_weight( bool fn_bidirectional ) { - AT_CHECK(weight_arr.size() > 0, + TORCH_CHECK(weight_arr.size() > 0, "_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); auto any_param = weight_arr[0]; @@ -701,7 +701,7 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - AT_CHECK(!cx.defined(), + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } @@ -714,9 +714,9 @@ std::tuple _cudnn_rnn( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - AT_CHECK(hx.is_contiguous(), + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); - AT_CHECK(!cx.defined() || cx.is_contiguous(), + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); @@ -750,7 +750,7 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + TORCH_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes()); size_t workspace_size; @@ -842,7 +842,7 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - AT_CHECK(!cx.defined(), + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } @@ -857,9 +857,9 @@ std::tuple _cudnn_rnn_backward_input( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - AT_CHECK(hx.is_contiguous(), + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); - AT_CHECK(!cx.defined() || cx.is_contiguous(), + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); @@ -873,24 +873,24 @@ std::tuple _cudnn_rnn_backward_input( AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); auto dcx = cx.defined() ? 
at::empty(hidden_size, cx.options()) : Tensor(); - AT_CHECK(fn_train, + TORCH_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); - AT_CHECK(input.sizes().equals(input_size), + TORCH_CHECK(input.sizes().equals(input_size), "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - AT_CHECK(output.sizes().equals(output_size), + TORCH_CHECK(output.sizes().equals(output_size), "Expected output size ", IntArrayRef{output_size}, ", got ", output.sizes()); - AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); - AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + TORCH_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes()); - AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), + TORCH_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), "Expected d_hidden size ", IntArrayRef{hidden_size}, ", got ", dhy.sizes()); - AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), + TORCH_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), "Expected d_cell size ", IntArrayRef{hidden_size}, ", got ", dcy.sizes()); - AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); @@ -965,7 +965,7 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - AT_CHECK(!cx.defined(), + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } @@ -978,20 +978,20 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - AT_CHECK(fn_train, + TORCH_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); - AT_CHECK(input.sizes().equals(input_size), + TORCH_CHECK(input.sizes().equals(input_size), "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - AT_CHECK(hx.is_contiguous(), + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); - AT_CHECK(!cx.defined() || cx.is_contiguous(), + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); @@ -1236,7 +1236,7 @@ std::pair _cudnn_impl( AT_WARN(WEIGHT_FORMAT_WARN); } - AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); + TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); IntArrayRef batch_sizes { _batch_sizes.data(), static_cast(_batch_sizes.size(0)) }; auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp new file mode 100644 index 000000000000..1dce347c92ea --- /dev/null +++ b/aten/src/ATen/native/layer_norm.cpp @@ -0,0 +1,122 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +namespace { + +std::tuple 
layer_norm_forward_cpu( + const Tensor& X, + const Tensor& gamma /* optional */, + const Tensor& beta /* optional */, + int64_t M, + int64_t N, + double eps) { + Tensor Y = at::native::empty_like(X); + Tensor mean = at::empty({M}, X.options()); + Tensor rstd = at::empty({M}, X.options()); + LayerNormKernel(kCPU, X, gamma, beta, M, N, eps, &Y, &mean, &rstd); + return std::make_tuple(Y, mean, rstd); +} + +} // namespace + +Tensor layer_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */, + double eps, + bool cudnn_enabled) { + const int normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + const auto input_shape = input.sizes(); + const auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim) + .equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + AT_ERROR(ss.str()); + } + + const int axis = input_ndim - normalized_ndim; + const int64_t M = std::accumulate( + input_shape.cbegin(), + input_shape.cbegin() + axis, + 1LL, + std::multiplies()); + const int64_t N = std::accumulate( + input_shape.cbegin() + axis, + input_shape.cend(), + 1LL, + std::multiplies()); + + // TODO(yangxm): Remove this check after backward pass landed. 
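// Illustrative note (not part of the patch): a worked example of the M/N
// split computed above. Everything left of `axis` folds into M (the number
// of rows that get their own mean and rstd) and the normalized dimensions
// fold into N (the row length). With an input of shape [32, 128, 768]:
//   normalized_shape = [768]       -> axis = 2, M = 32 * 128 = 4096, N = 768
//   normalized_shape = [128, 768]  -> axis = 1, M = 32,             N = 98304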
+ const auto is_forward = [](const Tensor& tensor) { + return tensor.is_variable() && !tensor.requires_grad(); + }; + if (input.device().is_cpu() && is_forward(input) && is_forward(weight) && + is_forward(bias)) { + return std::get<0>(layer_norm_forward_cpu( + input.contiguous(), weight.contiguous(), bias.contiguous(), M, N, eps)); + } + + // Apply layer norm + auto input_reshaped = input.contiguous().view({1, M, -1}); + auto out = at::batch_norm( + input_reshaped, {}, {}, {}, {}, true, 0, eps, cudnn_enabled); + out = out.view(input_shape); + + if (weight.defined() && bias.defined()) { + return bias.addcmul(out, weight, 1); + } else if (weight.defined()) { + return out.mul(weight); + } else if (bias.defined()) { + return out.add(bias); + } else { + return out; + } +} + +DEFINE_DISPATCH(LayerNormKernel); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index cffbb1b1ac77..d7e9dab945db 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -211,10 +211,10 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) { - AT_CHECK(args.size() <= expected_size, + TORCH_CHECK(args.size() <= expected_size, "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); - AT_CHECK(args.size() >= expected_size, + TORCH_CHECK(args.size() >= expected_size, "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", expected_size, " (while checking arguments for ", c, ")"); diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index a6ecdcde198c..126329cdfe31 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -52,30 +52,36 @@ static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANS template static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, const Tensor& mat2, Scalar beta_, Scalar alpha_) { - auto is_transposed = [&](const Tensor& t) { + auto is_transposed = [&](const TensorAccessor& t) { return t.stride(0) == 1 && t.stride(1) >= t.size(0); }; - const CBLAS_TRANSPOSE trans_A = is_transposed(mat1[0]) ? CblasTrans : CblasNoTrans; - const CBLAS_TRANSPOSE trans_B = is_transposed(mat2[0]) ? CblasTrans : CblasNoTrans; - const int batch_size = mat1.size(0); - const int M = mat1.size(1); - const int N = mat2.size(2); - const int K = mat1.size(2); + auto mat1_acc = mat1.accessor(); + auto mat2_acc = mat2.accessor(); + auto res_acc = res.accessor(); + + const CBLAS_TRANSPOSE trans_A = is_transposed(mat1_acc[0]) ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_B = is_transposed(mat2_acc[0]) ? CblasTrans : CblasNoTrans; + + const int batch_size = mat1_acc.size(0); + const int M = mat1_acc.size(1); + const int N = mat2_acc.size(2); + const int K = mat1_acc.size(2); scalar_t alpha = alpha_.to(); scalar_t beta = beta_.to(); - const int lda = is_transposed(mat1[0]) ? mat1[0].stride(1) : mat1[0].stride(0); - const int ldb = is_transposed(mat2[0]) ? mat2[0].stride(1) : mat2[0].stride(0); + const int lda = is_transposed(mat1_acc[0]) ? mat1_acc[0].stride(1) : mat1_acc[0].stride(0); + const int ldb = is_transposed(mat2_acc[0]) ? 
mat2_acc[0].stride(1) : mat2_acc[0].stride(0); const int ldc = res[0].stride(0); std::vector A(batch_size); std::vector B(batch_size); std::vector C(batch_size); + for (int64_t batch = 0; batch < batch_size; batch++) { - A[batch] = mat1[batch].data(); - B[batch] = mat2[batch].data(); - C[batch] = res[batch].data(); + A[batch] = mat1_acc[batch].data(); + B[batch] = mat2_acc[batch].data(); + C[batch] = res_acc[batch].data(); } gemm_batched(trans_A, trans_B, batch_size, M, N, K, alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index b2e135a524b9..58401d2d30ae 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -176,7 +176,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, osize = output_sizes[i]; istride = complex_input ? input.stride(i) >> 1 : input.stride(i); ostride = onumel; - AT_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, + TORCH_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); if (!need_contiguous && istride > MKL_LONG_MAX) { // If we didn't plan to contiguous-fy but the `istride` exceeds bound, @@ -186,7 +186,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, // fine as `inumel` is non-decreasing. need_contiguous = true; } - AT_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, + TORCH_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); inumel *= isize; onumel *= osize; diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index 0e1b351bdde1..6314773b5839 100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -73,7 +73,7 @@ static Tensor _mkldnn_pool2d( IntArrayRef dilation, bool ceil_mode, ideep::algorithm algo) { - AT_CHECK(!ceil_mode, "Currently Mkldnn Pooling operators do not support ceil_mode."); + TORCH_CHECK(!ceil_mode, "Currently Mkldnn Pooling operators do not support ceil_mode."); auto kernel_size_vec = expand_param_if_needed(kernel_size, "kernel_size", 2); auto stride_vec = expand_param_if_needed(stride, "stride", 2); auto padding_vec = expand_param_if_needed(padding, "padding", 2); diff --git a/aten/src/ATen/native/mkldnn/UnaryOps.cpp b/aten/src/ATen/native/mkldnn/UnaryOps.cpp new file mode 100644 index 000000000000..5045acd60a57 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/UnaryOps.cpp @@ -0,0 +1,46 @@ +#include +#include +#include + +#if !AT_MKLDNN_ENABLED() + +namespace at { +namespace native { + +Tensor mkldnn_sigmoid(const Tensor& self) { + AT_ERROR("mkldnn_sigmoid: ATen not compiled with MKLDNN support"); +} + +Tensor& mkldnn_sigmoid_(Tensor& self) { + AT_ERROR("mkldnn_sigmoid_: ATen not compiled with MKLDNN support"); +} + +} // namespace native +} // namespace at + +#else // AT_MKLDNN_EBABLED + +#include + +namespace at { +namespace native { + +Tensor mkldnn_sigmoid(const Tensor& self) { + ideep::tensor& x = itensor_from_mkldnn(self); + ideep::tensor y; + ideep::eltwise_forward::compute( + x, y, ideep::algorithm::eltwise_logistic, ideep::prop_kind::forward); + return new_with_itensor_mkldnn(std::move(y), self.options()); +} + +Tensor& mkldnn_sigmoid_(Tensor& self) { + ideep::tensor& x = itensor_from_mkldnn(self); + ideep::eltwise_forward::compute( + x, x, 
ideep::algorithm::eltwise_logistic, ideep::prop_kind::forward); + return self; +} + +} // namespace native +} // namespace at + +#endif // AT_MKLDNN_EBABLED diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2fec270eabbf..7c179ec58505 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -221,9 +221,6 @@ - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) variants: function, method - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: as_strided_tensorimpl CUDA: as_strided_tensorimpl @@ -424,7 +421,7 @@ - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor variants: function -- func: contiguous(Tensor self) -> Tensor +- func: contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor variants: method - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor @@ -456,28 +453,8 @@ variants: method device_guard: False -- func: s_copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - cpu_half: True - cpu_bool: True - cuda_bool: True - dispatch: - CPU: _s_copy__cpu - CUDA: _s_copy__cuda - QuantizedCPU: _s_copy__quantized - -- func: _s_copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - cpu_half: True - cpu_bool: True - cuda_bool: True - dispatch: - CUDA: _s_copy_from_cuda - -- func: _copy_same_type_(Tensor(a!) self, Tensor src) -> void - cpu_half: True - cpu_bool: True - cuda_bool: True - dispatch: - CPU: _copy_same_type__cpu +- func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor + dispatch: {} - func: cos(Tensor self) -> Tensor variants: function, method @@ -696,9 +673,6 @@ CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -711,9 +685,6 @@ - func: resize_(Tensor(a!) self, int[] size) -> Tensor(a!) variants: method - cpu_bool: True - cuda_bool: True - cpu_half: True device_guard: False dispatch: CPU: resize_cpu_ @@ -729,9 +700,6 @@ device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda @@ -1411,6 +1379,9 @@ - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method +- func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor + variants: function + - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: rand(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -1622,12 +1593,17 @@ - func: sigmoid(Tensor self) -> Tensor variants: function, method + dispatch: + CPU: sigmoid + CUDA: sigmoid + MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) variants: function, method dispatch: CPU: _sigmoid__cpu CUDA: _sigmoid__cuda + MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -1829,6 +1805,12 @@ - func: std(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method +- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + variants: function + +- func: std_mean(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + variants: function + - func: std(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. @@ -2005,6 +1987,12 @@ - func: var(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + variants: function + +- func: var_mean(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) + variants: function + - func: view_as(Tensor self, Tensor other) -> Tensor variants: method device_guard: False @@ -2128,9 +2116,6 @@ - func: clone(Tensor self) -> Tensor variants: function, method - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: clone CUDA: clone @@ -2139,9 +2124,6 @@ MkldnnCPU: mkldnn_clone - func: resize_as_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) - cpu_bool: True - cuda_bool: True - cpu_half: True variants: function, method dispatch: CPU: resize_as_ @@ -2166,9 +2148,6 @@ - func: zero_(Tensor(a!) self) -> Tensor(a!) variants: method, function - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: zero_ CUDA: zero_ @@ -2547,7 +2526,7 @@ - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor -- func: quantize_linear(Tensor self, float scale, int zero_point) -> Tensor +- func: quantize_linear(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor variants: function, method dispatch: CPU: quantize_linear_cpu @@ -2557,6 +2536,11 @@ dispatch: QuantizedCPU: dequantize_quant +- func: dequantize_linear(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor + variants: function, method + dispatch: + CPU: dequantize_linear_cpu + - func: q_scale(Tensor self) -> Scalar variants: function, method dispatch: @@ -2572,6 +2556,10 @@ dispatch: QuantizedCPU: int_repr_quant +- func: _per_tensor_affine_qtensor(Tensor self, float scale, int zero_point) -> Tensor + dispatch: + CPU: per_tensor_affine_qtensor_cpu + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. @@ -2603,12 +2591,7 @@ variants: method # NB: Does NOT check precondition that numel == 1 -# WARNING: Use of cpu_half here is generally not supported; please -# don't use it. - func: _local_scalar_dense(Tensor self) -> Scalar - cpu_half: True - cpu_bool: True - cuda_bool: True dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -2699,8 +2682,6 @@ - func: is_set_to(Tensor self, Tensor tensor) -> bool variants: method - cpu_bool: True - cuda_bool: True device_guard: False - func: masked_fill_(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) @@ -2722,9 +2703,6 @@ variants: function, method - func: view(Tensor(a) self, int[] size) -> Tensor(a) - cpu_half: True - cpu_bool: True - cuda_bool: True variants: method device_guard: False @@ -2951,6 +2929,9 @@ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) 
variants: method + dispatch: + CPU: uniform_cpu_ + CUDA: uniform_cuda_ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -3657,15 +3638,27 @@ - func: adaptive_avg_pool3d(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_out_cpu + CUDA: adaptive_avg_pool3d_out_cuda - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_cpu + CUDA: adaptive_avg_pool3d_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_out_cpu + CUDA: adaptive_avg_pool3d_backward_out_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn + dispatch: + CPU: adaptive_avg_pool3d_backward_cpu + CUDA: adaptive_avg_pool3d_backward_cuda # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) @@ -3804,18 +3797,30 @@ CUDA: fractional_max_pool3d_backward_cuda # Return: (Tensor output, Tensor indices) -- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) +- func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) python_module: nn + dispatch: + CPU: max_pool2d_with_indices_out_cpu + CUDA: max_pool2d_with_indices_out_cuda # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) python_module: nn + dispatch: + CPU: max_pool2d_with_indices_cpu + CUDA: max_pool2d_with_indices_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn + dispatch: + CPU: max_pool2d_with_indices_backward_out_cpu + CUDA: max_pool2d_with_indices_backward_out_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor python_module: nn + dispatch: + CPU: max_pool2d_with_indices_backward_cpu + CUDA: max_pool2d_with_indices_backward_cuda # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) output, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp index 3e66085cdf54..3951d86bc243 100644 --- a/aten/src/ATen/native/quantized/Copy.cpp +++ b/aten/src/ATen/native/quantized/Copy.cpp @@ -1,27 +1,28 @@ -#include +#include #include -#include -#include #include namespace at { namespace native { -Tensor& _s_copy__quantized(Tensor& self, const Tensor& src, bool /* unused */) { - AT_CHECK( - self.scalar_type() == at::kQInt8, - "Quantized copy only works with kQInt8 as target Tensor"); - AT_CHECK( +Tensor& quantized_copy_(Tensor& self, const Tensor& src) { + TORCH_CHECK( src.scalar_type() == at::kFloat, "Quantized copy only works with kFloat as source Tensor"); - qint8* self_data = self.data(); - float* src_data = src.data(); - for (int i = 0; i < self.numel(); ++i) { - self_data[i] = quantize_uint8( - self.q_scale().to(), - self.q_zero_point().to(), - src_data[i]); - } + TORCH_CHECK(self.is_contiguous() && src.is_contiguous(), + "Quantized copy only works with contiguous Tensors"); + TORCH_CHECK(self.sizes().equals(src.sizes()), + "Quantized copy only works with Tensors with the same shape"); + AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() { + float* src_data = src.data(); + scalar_t* self_data = self.data(); + for (int i = 0; i < self.numel(); ++i) { + self_data[i] = quantize_val( + self.q_scale().to(), + self.q_zero_point().to(), + src_data[i]); + } + }); return self; } } // namespace native diff --git a/aten/src/ATen/native/quantized/Copy.h b/aten/src/ATen/native/quantized/Copy.h new file mode 100644 index 000000000000..0e55387a8545 --- /dev/null +++ b/aten/src/ATen/native/quantized/Copy.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace at { +namespace native { + +Tensor& quantized_copy_(Tensor& self, const Tensor& src); + +} +} diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 0166632daa33..20ceb5d32cea 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -7,8 +7,8 @@ namespace at { namespace native { -Tensor quantize_linear_cpu(const Tensor& self, double scale, int64_t zero_point) { - auto quantizer = make_per_tensor_affine_quantizer(scale, zero_point); +Tensor quantize_linear_cpu(const Tensor& self, double scale, int64_t zero_point, ScalarType dtype) { + auto quantizer = make_per_tensor_affine_quantizer(scale, zero_point, dtype); return quantizer->quantize(self); } @@ -16,6 +16,21 @@ Tensor dequantize_quant(const Tensor& self) { return get_qtensorimpl(self)->quantizer()->dequantize(self); } +Tensor dequantize_linear_cpu(const Tensor& self, double scale, int64_t zero_point, ScalarType dtype) { + AT_CHECK(isQIntType(toQIntType(self.scalar_type())), + "Scalar type for quantized Tensor must have same underlying type as input."); + AT_CHECK(dtype == ScalarType::Float, "ScalarType for target Tensor must be float."); + Tensor f = at::empty(self.sizes(), self.options().dtype(dtype)); + AT_DISPATCH_QINT_TYPES( + toQIntType(self.scalar_type()), "dequantize_linear_cpu", [&]() { + underlying_t* qdata = self.data(); + auto* fdata = f.data(); + for (int i = 0; i < self.numel(); ++i) { + fdata[i] = (static_cast(qdata[i]) - zero_point) * scale; + }}); + return f; +} + Scalar q_scale_quant(const Tensor& self) { auto quantizer = get_qtensorimpl(self)->quantizer(); AT_ASSERT(quantizer->qscheme() == kPerTensorAffine); @@ -33,12 +48,27 @@ Quantizer* quantizer(const Tensor& self) { } Tensor 
int_repr_quant(const Tensor& self) { - Tensor dst = at::empty(self.sizes(), self.options().dtype(at::kByte)); - uint8_t* self_data = reinterpret_cast(self.data()); - uint8_t* dst_data = dst.data(); - if (self.numel() > 0) { - memcpy(dst_data, self_data, self.numel()); - } + Tensor dst; + AT_DISPATCH_QINT_TYPES( + self.scalar_type(), "int_repr", [&]() { + dst = at::empty(self.sizes(), self.options().dtype(UNDERLYING_TYPE)); + underlying_t* self_data = reinterpret_cast(self.data()); + underlying_t* dst_data = dst.data(); + if (self.numel() > 0) { + memcpy(dst_data, self_data, self.nbytes()); + }}); + return dst; +} + +Tensor per_tensor_affine_qtensor_cpu(const Tensor& self, double scale, int64_t zero_point) { + Tensor dst = at::_empty_affine_quantized(self.sizes(), self.options().dtype(toQIntType(self.scalar_type())), scale, zero_point); + AT_DISPATCH_QINT_TYPES(dst.scalar_type(), "per_tensor_affine_qtensor", [&]() { + underlying_t* self_data = self.data(); + underlying_t* dst_data = reinterpret_cast(dst.data()); + if (self.numel() > 0) { + memcpy(dst_data, self_data, self.numel()); + } + }); return dst; } diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index 100d19c5036f..d3268a935a00 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -11,7 +11,8 @@ namespace native { // support quantizer in python frontend, once // that is ready, we'll change to use quantizer Tensor empty_affine_quantized_cpu(IntArrayRef size, const TensorOptions& options, double scale, int64_t zero_point) { - return new_qtensor_cpu(size, options, make_per_tensor_affine_quantizer(scale, zero_point)); + TORCH_CHECK(options.has_dtype(), "Must provide data type for Tensor creation functions."); + return new_qtensor_cpu(size, options, make_per_tensor_affine_quantizer(scale, zero_point, typeMetaToScalarType(options.dtype()))); } }} // at::native diff --git a/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp b/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp index 3efb36e2edd5..b7a8d1fe9992 100644 --- a/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp +++ b/aten/src/ATen/native/quantized/cpu/fake_quantize.cpp @@ -136,11 +136,13 @@ class FakeQuantizePerTensorAffineOp_backward : public c10::OperatorKernel { static auto registry = c10::RegisterOperators() .op("quantized::fake_quantize_per_tensor_affine_forward(Tensor X, float scale, int zero_point, int num_bits = 8, int quant_delay = 0, int iter = 0) -> Tensor", - c10::kernel(), - c10::dispatchKey(CPUTensorId())) + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())) .op("quantized::fake_quantize_per_tensor_affine_backward(Tensor X, Tensor dY, float scale, int zero_point, int num_bits=8, int quant_delay=0, int iter = 0) -> Tensor", - c10::kernel(), - c10::dispatchKey(CPUTensorId())); + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())); } // namespace }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index 05692969253d..498ce8aca4c8 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -20,6 +20,14 @@ struct FBGEMM_API PackedFCWeight { int w_zp; }; +struct FBGEMM_API PackedConvWeight { + std::unique_ptr> w; + std::vector col_offsets; + std::vector kernel; + float w_scale; + int32_t w_zp; +}; + // Convert the weight from uint8 
to int8. static void convert_uint8_int8( int K, diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp new file mode 100644 index 000000000000..a8361835ba92 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { +namespace { +template +class QAddInt8 final : public c10::OperatorKernel { + public: + Tensor operator()(at::Tensor qa, at::Tensor qb, + double scale, int64_t zero_point) { + AT_ASSERTM(qa.numel() == qb.numel(), "Add operands must be the same size!"); + TORCH_CHECK(qa.scalar_type() == qb.scalar_type(), "Add operands should have same data type."); + auto a = qa.dequantize(); + auto b = qb.dequantize(); + auto c = at::empty_like(a); + auto iter = TensorIterator::binary_op(c, a, b); + + if (ReLUFused) { + binary_kernel(*iter, [&](float a_val, float b_val) -> float { + return std::max(a_val + b_val, 0); + }); + } else { + binary_kernel(*iter, [&](float a_val, float b_val) -> float { + return a_val + b_val; + }); + } + return c.quantize_linear(scale, zero_point, qa.scalar_type()); // Requantize + } +}; + +static auto registry = c10::RegisterOperators() +.op("quantized::add(Tensor qa, Tensor qb, float scale, int zero_point)" + "-> Tensor qc", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())) +.op("quantized::add_relu(Tensor qa, Tensor qb, float scale, int zero_point)" + "-> Tensor qc", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())); +} // namespace +}} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp new file mode 100644 index 000000000000..7675a1cd02e6 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -0,0 +1,156 @@ +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { +class QConv2dInt8 final : public c10::OperatorKernel { + public: +#ifdef USE_FBGEMM + Tensor operator()( + Tensor act, + Tensor packed_weight, + Tensor bias, + const std::vector& stride, + const std::vector& padding, + const std::vector& dilation, + const std::vector& output_padding, + int64_t groups, + double output_scale, + int64_t output_zero_point) { + TORCH_CHECK( + fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + TORCH_CHECK( + act.ndimension() == 4, + "Activations are supposed to have 4 dimensions."); + TORCH_CHECK(stride.size() == 2, "2D convolution only"); + TORCH_CHECK(padding.size() == 2, "2D convolution only"); + TORCH_CHECK(dilation.size() == 2, "2D convolution only"); + TORCH_CHECK(output_padding.size() == 2, "2D convolution only"); + TORCH_CHECK( + (dilation[0] == 1 && dilation[1] == 1), + "Currently dilation should be 1"); + TORCH_CHECK( + (output_padding[0] == 0 && output_padding[1] == 0), + "Currently output padding should be 0"); + + // inputs are in NHWC format + int N = act.size(0); + int H = act.size(1); + int W = act.size(2); + int C = act.size(3); + int K = bias.size(0); + + Tensor act_contig = act.contiguous(); + const uint8_t* act_ptr = + reinterpret_cast(act_contig.data()); + + PackedConvWeight& pack_ptr = + cpp_custom_type_hack::cast(packed_weight); + auto packB = pack_ptr.w.get(); + // packB->printPackedMatrix("PackedB inside QConv2dInt8:"); + auto& col_offsets = pack_ptr.col_offsets; + auto& kernel = pack_ptr.kernel; + + std::vector row_offset_buf( + 
fbgemm::PackAWithIm2Col::rowOffsetBufferSize()); + + int pad_l = padding[0]; + int pad_t = padding[1]; + int stride_h = stride[0]; + int stride_w = stride[1]; + int kernel_h = kernel[0]; + int kernel_w = kernel[1]; + + fbgemm::conv_param_t<> conv_p( + N, // Batch size + C, // Number of input channels + K, // Number of output channels + {H, W}, + groups, + {kernel_h, kernel_w}, + {stride_h, stride_w}, + {pad_l, pad_t, pad_l, pad_t}); + + fbgemm::PackAWithIm2Col packA( + conv_p, + act_ptr, + nullptr, + act.q_zero_point().toInt(), + row_offset_buf.data()); + + fbgemm::DoNothing<> NoOpObj{}; + + auto bias_contig = bias.contiguous(); + + float act_scale = act.q_scale().toFloat(); + int32_t act_zero_point = act.q_zero_point().toInt(); + + float weight_scale_float = pack_ptr.w_scale; + int32_t weight_zero_point_int32 = pack_ptr.w_zp; + + float output_multiplier_float = + (act_scale * weight_scale_float) / static_cast(output_scale); + + fbgemm::ReQuantizeOutput outputProcObj( + NoOpObj, + &output_multiplier_float, + output_zero_point, + act_zero_point, + &weight_zero_point_int32, + packA.getRowOffsetBuffer(), + col_offsets.data(), + bias_contig.data(), + K, + groups); + + Tensor output = _empty_affine_quantized( + {N, H, W, K}, + device(kCPU).dtype(kQUInt8), + output_scale, + output_zero_point); + auto buffer = at::zeros_like(output, output.options().dtype(at::kInt)); + + // Do the GEMM + fbgemm::fbgemmPacked( + packA, + *packB, + reinterpret_cast(output.data()), + buffer.data(), + K, + outputProcObj, + 0 /* thread_id*/, + 1 /* num_threads */); + + return output; + } +#else // USE_FBGEMM + Tensor operator()( + Tensor /* activation */, + Tensor /* packed_weight */, + Tensor /* bias */, + const std::vector& /* stride */, + const std::vector& /* padding */, + const std::vector& /* dilation */, + const std::vector& /* output padding */, + int64_t /* groups */, + double /* output scale */, + int64_t /* output_zero_point */) { + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } +#endif // USE_FBGEMM +}; + +static auto registry = c10::RegisterOperators().op( + "quantized::fbgemm_conv2d", + c10::RegisterOperators::options().kernel().dispatchKey( + QuantizedCPUTensorId())); + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp new file mode 100644 index 000000000000..f009a6aad81b --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include + +namespace caffe2 { +#ifdef USE_FBGEMM +// Required for cpp_custom_type_hack to work +CAFFE_KNOWN_TYPE(PackedConvWeight); +#endif +} // namespace caffe2 + +namespace at { +namespace native { +namespace { +class QConvPackWeightInt8 final : public c10::OperatorKernel { + public: +#ifdef USE_FBGEMM + Tensor operator()(Tensor weight, int64_t groups) { + TORCH_CHECK( + weight.ndimension() == 4, "Weights are expected to have 4 dimensions"); + TORCH_CHECK(groups == 1, "Groupwise convolutions are not supported yet"); + // weights in RS(C/G)K format + // matrix dimensions after im2col + int NDim = weight.size(3) / groups; + int KDim = weight.size(0) * weight.size(1) * groups * weight.size(2); + auto weight_config = weight.contiguous(); + int weight_zero_point_int32 = weight.q_zero_point().toInt(); + TORCH_CHECK( + weight_zero_point_int32 == 0, + "Only symmetric quantization is supported for weights yet"); + const int8_t* 
weight_ptr_int8 = + reinterpret_cast(weight_config.data()); + + std::vector col_offsets(NDim * groups); + std::vector kernel{static_cast(weight.size(0)), + static_cast(weight.size(1))}; + std::vector weight_int8(KDim * NDim * groups); + auto ret_ptr = guts::make_unique( + PackedConvWeight{guts::make_unique>( + fbgemm::matrix_op_t::NoTranspose, + KDim, + NDim, + weight_ptr_int8, + NDim, + nullptr, // PackBMatrix manages ownership of pmat + groups), + col_offsets, + kernel, + weight.q_scale().toFloat(), + weight_zero_point_int32}); + // TODO: we will need to replace this with torchscript classes at a later + // point. + return cpp_custom_type_hack::create(std::move(ret_ptr), weight.options()); + } +#else // USE_FBGEMM + Tensor operator()( + Tensor, /* weight */ + int64_t /* groups */ + ) { + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } +#endif // USE_FBGEMM +}; + +static auto registry = c10::RegisterOperators().op( + "quantized::fbgemm_conv_prepack", + c10::RegisterOperators::options().kernel().dispatchKey( + QuantizedCPUTensorId())); + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qfc.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp similarity index 91% rename from aten/src/ATen/native/quantized/cpu/qfc.cpp rename to aten/src/ATen/native/quantized/cpu/qlinear.cpp index 62c02d63cd76..f276010e9412 100644 --- a/aten/src/ATen/native/quantized/cpu/qfc.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -32,7 +32,7 @@ class QFCInt8 final : public c10::OperatorKernel { // TODO: contiguous is called for further jit optimizations. auto input_contig = input.contiguous(); const auto* input_ptr = - reinterpret_cast(input_contig.data()); + reinterpret_cast(input_contig.data()); AT_ASSERT(input.dim() >= 2); // C(output) = A(input) x B(weight), where C, A, B are M x N, M x K, K x N @@ -95,6 +95,8 @@ class QFCInt8 final : public c10::OperatorKernel { // TODO: contiguous is called for further jit optimizations. 
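The quantized copy and dequantize_linear_cpu kernels earlier in this patch both apply the per-tensor affine mapping q = clamp(round(x / scale) + zero_point, qmin, qmax) and its inverse x ~= (q - zero_point) * scale. A minimal scalar sketch of that mapping for quint8 (illustrative values; plain C++ rather than ATen's quantize_val, whose rounding details may differ):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Per-tensor affine quantization of a float to uint8 and back.
    uint8_t quantize_affine(float x, float scale, int32_t zero_point) {
      const int32_t qmin = 0, qmax = 255;  // quint8 range
      const int32_t q = static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::min(qmax, std::max(qmin, q)));
    }

    float dequantize_affine(uint8_t q, float scale, int32_t zero_point) {
      return (static_cast<int32_t>(q) - zero_point) * scale;
    }

    int main() {
      const float scale = 0.1f;
      const int32_t zero_point = 128;
      const uint8_t q = quantize_affine(1.23f, scale, zero_point);  // round(12.3) + 128 = 140
      std::printf("q=%u, x'=%f\n", static_cast<unsigned>(q),
                  dequantize_affine(q, scale, zero_point));          // x' = 1.2
      return 0;
    }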
auto bias_contig = bias.contiguous(); + const auto* bias_ptr = + reinterpret_cast(bias_contig.data()); // After the uint8 * int8 matrix multiplication is performed, this operation // does: @@ -108,13 +110,13 @@ class QFCInt8 final : public c10::OperatorKernel { /*Bq_zero_point=*/&weight_zero_point_int32, /*row_offsets=*/packA.getRowOffsetBuffer(), /*col_offsets=*/col_offsets.data(), - /*bias=*/bias_contig.data(), + /*bias=*/bias_ptr, /*nCol=*/N); // Allocate output Tensor and a buffer for fbgemmPacked to use auto output = _empty_affine_quantized( {M, N}, - at::device(kCPU).dtype(kQInt8), + at::device(kCPU).dtype(kQUInt8), output_scale, output_zero_point); @@ -124,7 +126,7 @@ class QFCInt8 final : public c10::OperatorKernel { fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/reinterpret_cast(output.data()), + /*C=*/reinterpret_cast(output.data()), /*C_buffer=*/buffer.data(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -152,11 +154,13 @@ class QFCInt8 final : public c10::OperatorKernel { static auto registry = c10::RegisterOperators() .op("quantized::fbgemm_linear(Tensor X, Tensor W_prepack, Tensor b, float Y_scale_i, int Y_zero_point_i) -> Tensor Y", - c10::kernel>(), - c10::dispatchKey(QuantizedCPUTensorId())) + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())) .op("quantized::fbgemm_linear_relu(Tensor X, Tensor W_prepack, Tensor b, float Y_scale_i, int Y_zero_point_i) -> Tensor Y", - c10::kernel>(), - c10::dispatchKey(QuantizedCPUTensorId())); + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(QuantizedCPUTensorId())); } // namespace } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qfc_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp similarity index 89% rename from aten/src/ATen/native/quantized/cpu/qfc_prepack.cpp rename to aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 0d1e18f6733c..ead10f05f611 100644 --- a/aten/src/ATen/native/quantized/cpu/qfc_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -45,16 +45,13 @@ class QFCPackWeightInt8 final : public c10::OperatorKernel { auto N = weight.size(0); auto K = weight.size(1); - int32_t weight_zero_point_int32 = weight.q_zero_point().toInt() - 128; + int32_t weight_zero_point_int32 = weight.q_zero_point().toInt(); // TODO: contiguous is called for further JIT optimizations. 
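Both the quantized conv2d and linear kernels above hand fbgemm::ReQuantizeOutput the multiplier (act_scale * w_scale) / out_scale to fold the int32 GEMM accumulator back into quint8. A simplified scalar sketch of that requantization step, assuming the row/column zero-point corrections and the bias term that fbgemm also folds in are ignored:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Simplified requantization of an int32 accumulator to quint8.
    // fbgemm's ReQuantizeOutput additionally subtracts row/column offset terms
    // and adds the bias; this sketch keeps only the scaling step.
    uint8_t requantize(int32_t acc, float act_scale, float weight_scale,
                       float out_scale, int32_t out_zero_point) {
      const float multiplier = (act_scale * weight_scale) / out_scale;
      const int q = static_cast<int>(std::nearbyint(acc * multiplier)) + out_zero_point;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }

    int main() {
      // 1234 * (0.05 * 0.02) / 0.1 = 12.34 -> 12
      return requantize(1234, 0.05f, 0.02f, 0.1f, 0) == 12 ? 0 : 1;
    }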
auto weight_contig = weight.contiguous(); - std::vector weight_int8(K * N); - int8_t* weight_ptr_int8 = weight_int8.data(); - uint8_t* weight_ptr_uint8 = - reinterpret_cast(weight_contig.data()); - convert_uint8_int8(K, N, weight_ptr_uint8, weight_ptr_int8); + int8_t* weight_ptr_int8 = + reinterpret_cast(weight_contig.data()); std::vector col_offsets(N); calc_col_offsets_transpose( @@ -95,8 +92,9 @@ class QFCPackWeightInt8 final : public c10::OperatorKernel { static auto registry = c10::RegisterOperators().op( "quantized::fbgemm_linear_prepack(Tensor W) -> Tensor W_prepack", - c10::kernel(), - c10::dispatchKey(QuantizedCPUTensorId())); + c10::RegisterOperators::options() + .kernel() + .dispatchKey(QuantizedCPUTensorId())); } // namespace } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qrelu.cpp b/aten/src/ATen/native/quantized/cpu/qrelu.cpp index e179ff5b6fbe..ee540901087a 100644 --- a/aten/src/ATen/native/quantized/cpu/qrelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qrelu.cpp @@ -14,13 +14,13 @@ class QReluInt8 final : public c10::OperatorKernel { public: Tensor operator()(Tensor qx) { Tensor qy = at::_empty_affine_quantized(qx.sizes(), - at::device(kCPU).dtype(kQInt8), + at::device(kCPU).dtype(kQUInt8), qx.q_scale().toDouble(), qx.q_zero_point().toLong()); auto iter = TensorIterator::unary_op(qy, qx); const auto zero_point = qx.q_zero_point().toByte(); - unary_kernel(*iter, [&](c10::qint8 value) -> c10::qint8 { - return c10::qint8(std::max(value.val_, zero_point)); + unary_kernel(*iter, [&](c10::quint8 value) -> c10::quint8 { + return c10::quint8(std::max(value.val_, zero_point)); }); return qy; } @@ -28,8 +28,9 @@ class QReluInt8 final : public c10::OperatorKernel { static auto registry = c10::RegisterOperators().op( "quantized::relu(Tensor qx) -> Tensor", - c10::kernel(), - c10::dispatchKey(QuantizedCPUTensorId())); + c10::RegisterOperators::options() + .kernel() + .dispatchKey(QuantizedCPUTensorId())); } // namespace }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qsumrelu.cpp b/aten/src/ATen/native/quantized/cpu/qsumrelu.cpp deleted file mode 100644 index 0fdf368d5b58..000000000000 --- a/aten/src/ATen/native/quantized/cpu/qsumrelu.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -namespace at { namespace native { -namespace { -class QSumReLUInt8 final : public c10::OperatorKernel { - public: - Tensor operator()(at::Tensor qa, at::Tensor qb, - double scale, int64_t zero_point) { - AT_ASSERTM(qa.numel() == qb.numel(), "Sum operands must be the same size!"); - auto a = qa.dequantize(); - auto b = qb.dequantize(); - auto c = at::empty_like(a); - auto iter = TensorIterator::binary_op(c, a, b); - binary_kernel(*iter, [&](float a_val, float b_val) -> float { - return std::max(a_val + b_val, 0); - }); - return c.quantize_linear(scale, zero_point); // Requantize - } -}; - -static auto registry = c10::RegisterOperators().op( - "quantized::sum_relu(Tensor qa, Tensor qb, float scale, int zero_point)" - "-> Tensor qc", - c10::kernel(), - c10::dispatchKey(QuantizedCPUTensorId())); -} // namespace -}} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cuda/fake_quantize_per_tensor_affine.cu b/aten/src/ATen/native/quantized/cuda/fake_quantize_per_tensor_affine.cu new file mode 100644 index 000000000000..c50260c972c6 --- /dev/null +++ b/aten/src/ATen/native/quantized/cuda/fake_quantize_per_tensor_affine.cu @@ -0,0 +1,165 @@ +#include +#include +#include 
+#include + +/* FakeQuantize Op for PerTensorAffine quantization scheme */ +namespace at { namespace native { +namespace { +/* Fake-quantizes the 'inputs' tensor. +Args: + X: Forward input tensor. + scale: scale of per tensor affine quantization + zero_point: zero_point of per tensor affine quantization + num_bits: Number of quantization bits. + quant_delay: Count of global steps for which to delay the quantization. + See note below. + iter: The current quantization iteration used for `quant_delay`. +Returns: + Quantized tensor (double dtype). + +Notes: + - quant_delay might be set to non-zero to help weights stabilize in the + beginning of the training. + - quantization range [0, 2^bits - 1] +*/ +class FakeQuantizePerTensorAffineOp_forward : public c10::OperatorKernel { + public: + at::Tensor operator()( + at::Tensor X, + double scale, + int64_t zero_point, + int64_t num_bits = 8, + int64_t quant_delay = 0, + int64_t iter = 0 + ) { + // Sanity checks. + TORCH_CHECK(X.is_cuda()); + TORCH_CHECK(X.scalar_type() == ScalarType::Float); + if (num_bits > 32 || num_bits < 1) { + throw std::invalid_argument("`num_bits` should be in the [1, 32] range."); + } + if (zero_point < 0) { + throw std::invalid_argument("`zero_point` must be a positive integer."); + } + if (quant_delay < 0) { + throw std::invalid_argument("`quant_delay` must be a positive integer."); + } + + if (quant_delay != 0 && iter < 0) { + throw std::invalid_argument( + "`iter` must be >=0 for non-zero `quant_delay`"); + } + + auto Y = at::empty_like(X); + + if (quant_delay > 0 && iter <= quant_delay) { + Y.copy_(X); // We might want to just return the input here. + return Y; + } + + float inv_scale = 1.0f / scale; + const float quant_min = 0; + const float quant_max = (1 << num_bits) - 1; + at::cuda::CUDA_tensor_apply2( + X, + Y, + [=] __device__ ( + const float& input_val, + float& result_val) { + result_val = (fminf(quant_max, fmaxf(quant_min, (std::round(input_val * inv_scale + zero_point)))) - zero_point) * scale; + }); + return Y; + } +}; + +/* Backward path to fake-quantize the 'inputs' tensor. + +Args: + X: Forward input tensor. + dY: Backward input tensor. + scale: scale of per tensor affine quantization + zero_point: zero_point of per tensor affine quantization + num_bits: Number of quantization bits. + quant_delay: Count of global steps for which to delay the quantization. + See note in forward. + iter: The current quantization iteration used for `quant_delay`. +Returns: + Quantized tensor (double dtype). + +Notes: + - quant_delay might be set to non-zero to help weights stabilize in the + beginning of the training. + - quantization range [0, 2^bits - 1] +*/ +class FakeQuantizePerTensorAffineOp_backward : public c10::OperatorKernel { + public: + at::Tensor operator()( + at::Tensor X, + at::Tensor dY, + double scale, + int64_t zero_point, + int64_t num_bits = 8, + int64_t quant_delay = 0, + int64_t iter = 0) { + // Sanity checks. 
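The CUDA lambda in the forward op above performs straight-through fake quantization: the value is snapped to the [0, 2^num_bits - 1] integer grid and immediately dequantized, while the backward op defined next masks gradients to that representable range. A scalar sketch of the forward arithmetic (illustrative only; not the CUDA_tensor_apply2 path):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar version of the fake-quantize forward arithmetic: quantize to the
    // integer grid, clamp to [0, 2^num_bits - 1], then dequantize.
    float fake_quantize(float x, float scale, int64_t zero_point, int64_t num_bits = 8) {
      const float inv_scale = 1.0f / scale;
      const float quant_min = 0.0f;
      const float quant_max = static_cast<float>((1LL << num_bits) - 1);
      const float q = std::fmin(quant_max,
                                std::fmax(quant_min, std::round(x * inv_scale + zero_point)));
      return (q - zero_point) * scale;
    }

    int main() {
      volatile float y = fake_quantize(0.123f, /*scale=*/0.1f, /*zero_point=*/0);  // snaps to 0.1
      (void)y;
      return 0;
    }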
+ TORCH_CHECK(X.is_cuda()); + TORCH_CHECK(X.scalar_type() == ScalarType::Float); + if (num_bits > 32 || num_bits < 1) { + throw std::invalid_argument("`num_bits` should be in the [1, 32] range."); + } + if (zero_point < 0) { + throw std::invalid_argument("`zero_point` must be a positive integer."); + } + if (quant_delay < 0) { + throw std::invalid_argument("`quant_delay` must be a positive integer."); + } + if (X.numel() <= 0) { + return X; + } + if (X.numel() != dY.numel()) { + throw std::invalid_argument("`X` and `dY` are not the same size"); + } + + if (quant_delay != 0 && iter < 0) { + throw std::invalid_argument( + "`iter` must be >=0 for non-zero `quant_delay`"); + } + + auto dX = at::zeros_like(dY); + if (quant_delay > 0 && iter <= quant_delay) { + dX.copy_(dY); + return dX; + } + + float inv_scale = 1.0f / scale; + const float quant_min = 0; + const float quant_max = (1 << num_bits) - 1; + auto mask = at::empty_like(dY); + at::cuda::CUDA_tensor_apply2( + X, + mask, + [=] __device__ ( + const float& input_val, + float& result_val) { + float Xq = std::round(input_val * inv_scale + zero_point); + result_val = float(Xq >= quant_min && Xq <= quant_max); + }); + dX = mask * dY; + return dX; + } +}; + +static auto registry = + c10::RegisterOperators() + .op("quantized::fake_quantize_per_tensor_affine_forward(Tensor X, float scale, int zero_point, int num_bits = 8, int quant_delay = 0, int iter = 0) -> Tensor", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CUDATensorId())) + .op("quantized::fake_quantize_per_tensor_affine_backward(Tensor X, Tensor dY, float scale, int zero_point, int num_bits=8, int quant_delay=0, int iter = 0) -> Tensor", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CUDATensorId())); + +} // namespace +}} // namespace at::native diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index c4ec81d0c2ef..62860f80c6ff 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -52,13 +52,13 @@ Tensor &_coalesced_sparse_(SparseTensor& self, bool coalesced) { } Tensor indices_sparse(const Tensor& self) { - AT_CHECK(self.is_coalesced(), + TORCH_CHECK(self.is_coalesced(), "Cannot get indices on an uncoalesced tensor, please call .coalesce() first"); return get_sparse_impl(self)->indices().alias(); } Tensor values_sparse(const Tensor& self) { - AT_CHECK(self.is_coalesced(), + TORCH_CHECK(self.is_coalesced(), "Cannot get values on an uncoalesced tensor, please call .coalesce() first"); return get_sparse_impl(self)->values().alias(); } @@ -91,7 +91,6 @@ SparseTensor new_with_dims_sparse(int64_t sparse_dim, int64_t dense_dim, ArrayRe return self; } -// Does NOT make copies of indices and values SparseTensor new_with_dims_and_tensor_sparse( int64_t sparse_dim, int64_t dense_dim, @@ -101,7 +100,16 @@ SparseTensor new_with_dims_and_tensor_sparse( const TensorOptions& options) { SparseTensor self = new_sparse(options); get_sparse_impl(self)->resize_(sparse_dim, dense_dim, size); - alias_into_sparse(self, indices, values); + // NOTE: There is no guarantee that `indices` and `values` don't contain AutogradMeta. However, + // we want to maintain the invariant that `indices_` and `values_` of a sparse tensor don't + // contain AutogradMeta, and to achieve that we shallow-copy `indices` and `values` here. 
+ auto indices_shallow_copy = LongTensor(indices.unsafeGetTensorImpl()->shallow_copy_and_detach( + /*version_counter=*/indices.unsafeGetTensorImpl()->version_counter(), + /*allow_tensor_metadata_change=*/true)); + auto values_shallow_copy = Tensor(values.unsafeGetTensorImpl()->shallow_copy_and_detach( + /*version_counter=*/values.unsafeGetTensorImpl()->version_counter(), + /*allow_tensor_metadata_change=*/true)); + alias_into_sparse(self, indices_shallow_copy, values_shallow_copy); return self; } @@ -109,7 +117,7 @@ SparseTensor new_with_dims_and_tensor_sparse( /** Empty init **/ Tensor empty_sparse(IntArrayRef size, const TensorOptions& options) { - AT_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); + TORCH_CHECK(!options.pinned_memory(), "Only dense CPU tensors can be pinned"); return new_with_dims_sparse(size.size(), 0, size, options); } @@ -137,11 +145,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, const Ten Tensor values = expand_values_if_needed(values_); // arg checking - AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + TORCH_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); // the following checks are redundant because they are also checked in SparseTensorImpl::set_indices_and_values_unsafe // but we need to ensure them in order to infer the shape. - AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) - AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) + TORCH_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); // If sizes are not given, it is inferred as max index of each dim. int64_t sparse_dim = indices.size(0); @@ -161,7 +169,7 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, const Ten auto cpu_computed_indices_sizes_accessor = cpu_computed_indices_sizes.accessor(); for (int64_t d = 0; d < sparse_dim; d++) { int64_t min_index_in_dim = cpu_min_indices_accessor[d]; - AT_CHECK(min_index_in_dim >= 0, + TORCH_CHECK(min_index_in_dim >= 0, "found negative index ", min_index_in_dim, " for dim ", d); computed_sizes[static_cast(d)] = cpu_computed_indices_sizes_accessor[d]; } @@ -186,14 +194,14 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, ArrayRef< Tensor values = expand_values_if_needed(values_); // arg checking - AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + TORCH_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); // the following checks are redundant because they are also checked in SparseTensorImpl::set_indices_and_values_unsafe // but we need to ensure them in order to infer the shape. 
- AT_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) - AT_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); + TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes()) + TORCH_CHECK(!indices.is_sparse(), "expected indices to be a dense tensor, but got indices of layout ", indices.layout()); int64_t sparse_dim = indices.size(0); int64_t dense_dim = values.dim() - 1; - AT_CHECK(size.size() == sparse_dim + dense_dim, + TORCH_CHECK(size.size() == sparse_dim + dense_dim, "number of dimensions must be sparse_dim (", sparse_dim, ") + dense_dim (", dense_dim, "), but got ", size.size()); // Check to make sure all indices are within the boundaries of `size` @@ -214,11 +222,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, ArrayRef< // NB: This used to sync ndim times to access each entry; now we copy // everything to CPU first and then access it. int64_t min_index_in_dim = cpu_min_indices_accessor[d]; - AT_CHECK(min_index_in_dim >= 0, + TORCH_CHECK(min_index_in_dim >= 0, "found negative index ", min_index_in_dim, " for dim ", d); int64_t max_index_in_dim = cpu_max_indices_accessor[d]; int64_t dim_size = size[static_cast(d)]; - AT_CHECK(max_index_in_dim < dim_size, + TORCH_CHECK(max_index_in_dim < dim_size, "size is inconsistent with indices: for dim ", d, ", size is ", dim_size, " but found index ", max_index_in_dim); } } @@ -236,7 +244,7 @@ Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, A Tensor values = expand_values_if_needed(values_); // arg checking - AT_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); + TORCH_CHECK(!options.has_layout() || options.layout() == kSparse, "expected sparse layout, but got layout ", options.layout()); int64_t sparse_dim = indices.size(0); int64_t dense_dim = values.dim() - 1; @@ -288,8 +296,8 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ int64_t dims = self.dim(); // TODO: it seems like sparse_dim == 0 could be supported even if self.dim() > 0, // but this would take some work and doesn't seem particularly useful. 
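The two sparse_coo_tensor overloads above either infer each dimension's size from the largest index seen or validate the indices against a user-supplied size, rejecting negative indices in both cases. A standalone sketch of that bookkeeping for a sparse_dim x nnz index matrix (hypothetical data, plain std::vector rather than the ATen accessors used above):

    #include <algorithm>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Infers per-dimension sizes from COO indices; mirrors the min-index check above.
    std::vector<int64_t> infer_coo_sizes(const std::vector<std::vector<int64_t>>& indices) {
      std::vector<int64_t> sizes(indices.size(), 0);
      for (size_t d = 0; d < indices.size(); ++d) {
        for (int64_t idx : indices[d]) {
          if (idx < 0) throw std::invalid_argument("found negative index");
          sizes[d] = std::max(sizes[d], idx + 1);  // a dim holding index k needs size >= k + 1
        }
      }
      return sizes;
    }

    // Validates indices against a given size; mirrors the max-index check above.
    void check_coo_sizes(const std::vector<std::vector<int64_t>>& indices,
                         const std::vector<int64_t>& size) {
      for (size_t d = 0; d < indices.size(); ++d) {
        for (int64_t idx : indices[d]) {
          if (idx < 0 || idx >= size[d])
            throw std::invalid_argument("size is inconsistent with indices");
        }
      }
    }

    int main() {
      // nnz = 3 entries of a 2-D tensor at positions (0,2), (1,0), (1,2)
      std::vector<std::vector<int64_t>> indices = {{0, 1, 1}, {2, 0, 2}};
      auto sizes = infer_coo_sizes(indices);  // {2, 3}
      check_coo_sizes(indices, {2, 3});       // passes; {2, 2} would throw
      return sizes.size() == 2 ? 0 : 1;
    }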
- AT_CHECK(sparse_dim > 0 || self.dim() == 0, "sparse_dim must be >0 if dimensionality > 0"); - AT_CHECK(sparse_dim <= dims, + TORCH_CHECK(sparse_dim > 0 || self.dim() == 0, "sparse_dim must be >0 if dimensionality > 0"); + TORCH_CHECK(sparse_dim <= dims, "sparse_dim must be less than or equal to self.dim()"); at::TensorOptions sparse_options = self.options().layout(kSparse); std::vector sizes = self.sizes().vec(); @@ -325,6 +333,9 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ // NB: Dropped the resizeNd variants Tensor sparse_to_dense(const SparseTensor& self) { + if(self.scalar_type() == ScalarType::Half && self.options().device().is_cpu()) { + AT_ERROR("to_dense() not supported for float16 on CPU"); + } Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); return dst.add_(self); } @@ -441,12 +452,12 @@ void inline sparse_mask_out_cpu_kernel( } SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { - AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); - AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + TORCH_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + TORCH_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", t.sizes(), " but mask has size ", mask.sizes()); AT_ASSERT(!t.is_cuda()); // we were supposed to have dispatched on this - AT_CHECK(!r.is_cuda(), "sparse_mask: expected 'out' to be CPU, but got CUDA"); - AT_CHECK(!mask.is_cuda(), "sparse_mask: expected 'mask' to be CPU, but got CUDA"); + TORCH_CHECK(!r.is_cuda(), "sparse_mask: expected 'out' to be CPU, but got CUDA"); + TORCH_CHECK(!mask.is_cuda(), "sparse_mask: expected 'mask' to be CPU, but got CUDA"); resize_as_sparse_(r, mask); if (mask._nnz() == 0) { return r.zero_(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 67b7d24edc80..6aa1966d3f4b 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -99,7 +99,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place - AT_CHECK( + TORCH_CHECK( r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); } else { @@ -110,7 +110,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { } SparseTensor& log1p_sparse_(SparseTensor& t) { - AT_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + TORCH_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); return log1p_out_sparse(t, t); } @@ -123,7 +123,7 @@ SparseTensor& log1p_sparse_(SparseTensor& t) { SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Scalar value) { AT_ASSERT(r.is_sparse()); AT_ASSERT(t_.is_sparse()); - AT_CHECK(value.toDouble() != 0, "pow: cannot raise to zeroth power on sparse tensor; it would make the result tensor dense"); + TORCH_CHECK(value.toDouble() != 0, "pow: cannot raise to zeroth power on sparse tensor; it would make the result tensor dense"); // This coalesce is why we can't easily provide an inplace variant SparseTensor t = t_.coalesce(); @@ -191,10 +191,10 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S AT_ASSERT(r.is_sparse()); 
AT_ASSERT(t.is_sparse()); AT_ASSERT(!t.is_cuda()); // the dispatch argument - AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!src.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!src.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); + TORCH_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { return copy_sparse_to_sparse_(r, t); @@ -203,7 +203,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S return mul_out_sparse_scalar(r, src, value); } - AT_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); + TORCH_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); // saving those because they can be overwritten when doing in-place operations int64_t t_nnz = t._nnz(), s_nnz = src._nnz(), max_nnz = t_nnz + s_nnz; @@ -336,10 +336,10 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, SparseTensorRef AT_ASSERT(sparse_.is_sparse()); AT_ASSERT(!dense.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!sparse_.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "add: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!sparse_.is_cuda(), "add: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(dense.sizes().equals(sparse_.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + TORCH_CHECK(dense.sizes().equals(sparse_.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", dense.sizes(), " while other has size ", sparse_.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); r.resize_as_(dense); @@ -384,12 +384,12 @@ SparseTensor& mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor return mul_out_sparse_zerodim(r, src_, t_); } - AT_CHECK(t_.sizes().equals(src_.sizes()), "mul operands have incompatible sizes"); + TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul operands have incompatible sizes"); AT_ASSERT(!t_.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!src_.is_cuda(), "mul: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "mul: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!src_.is_cuda(), "mul: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); + TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same sizes, but ", t_.sizes(), " != ", src_.sizes()); if (src_._nnz() == 0 || t_._nnz() == 0) { r.resize_as_(src_); @@ 
-536,24 +536,24 @@ Tensor& s_addmm_out_sparse_dense_cpu( ) { // TODO: This error message seems awfully opaque AT_ASSERT(!t.is_cuda()); - AT_CHECK(!r.is_cuda(), "addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!sparse_.is_cuda(), "addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!dense.is_cuda(), "addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); + TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); + TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); + TORCH_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); // ixj * jxk = ixk int64_t dim_i = sparse_.size(0); int64_t dim_j = sparse_.size(1); int64_t dim_k = dense.size(1); - AT_CHECK(dense.size(0) == dim_j, + TORCH_CHECK(dense.size(0) == dim_j, "addmm: Argument #3 (dense): Expected dim 0 size ", dim_j, ", got ", dense.size(0)); - AT_CHECK(t.size(0) == dim_i, + TORCH_CHECK(t.size(0) == dim_i, "addmm: Argument #1 (t): Expected dim 0 size ", dim_i, ", got ", t.size(0)); - AT_CHECK(t.size(1) == dim_k, + TORCH_CHECK(t.size(1) == dim_k, "addmm: Argument #1 (t): Expected dim 1 size ", dim_k, ", got ", t.size(1)); r.resize_({dim_i, dim_k}); @@ -629,21 +629,21 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, Scalar alpha = 1; AT_ASSERT(!sparse_.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "hspmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!dense.is_cuda(), "hspmm: expected 'other' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, + TORCH_CHECK(sparse_.sparse_dim() == 2, "hspmm: Argument #2: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, + TORCH_CHECK(sparse_.dense_dim() == 0, "hspmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, + TORCH_CHECK(dense.dim() == 2, "hspmm: Argument #3: matrices expected, got ", dense.dim(), "D tensor"); int64_t m = sparse_.size(0); int64_t k = sparse_.size(1); int64_t n = dense.size(1); - AT_CHECK(dense.size(0) == k, + TORCH_CHECK(dense.size(0) == k, "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); get_sparse_impl(r)->raw_resize_(1, 1, {m, n}); @@ -714,15 +714,15 @@ SparseTensor& _sspaddmm_out_cpu( Scalar alpha ) { AT_ASSERT(!t.is_cuda()); // dispatch argument - AT_CHECK(!r.is_cuda(), "sspaddmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(!dense.is_cuda(), 
"sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!r.is_cuda(), "sspaddmm: expected 'out' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!sparse_.is_cuda(), "sspaddmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK(!dense.is_cuda(), "sspaddmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, + TORCH_CHECK(sparse_.sparse_dim() == 2, "sspaddmm: Argument #2: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, + TORCH_CHECK(sparse_.dense_dim() == 0, "sspaddmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, + TORCH_CHECK(dense.dim() == 2, "sspaddmm: Argument #2: matrices expected, got ", dense.dim(), "D tensor"); SparseTensor sparse = sparse_.coalesce(); @@ -736,11 +736,11 @@ SparseTensor& _sspaddmm_out_cpu( // See test_saddmm get_sparse_impl(r)->raw_resize_(2, 0, {dim_i, dim_k}); - AT_CHECK(dense.size(0) == dim_j, + TORCH_CHECK(dense.size(0) == dim_j, "sspaddmm: Argument #3: Expected dim 0 size ", dim_j, ", got ", dense.size(0)); - AT_CHECK(t.size(0) == dim_i, + TORCH_CHECK(t.size(0) == dim_i, "sspaddmm: Argument #1: Expected dim 0 size ", dim_i, ", got ", t.size(0)); - AT_CHECK(t.size(1) == dim_k, + TORCH_CHECK(t.size(1) == dim_k, "sspaddmm: Argument #1: Expected dim 1 size ", dim_k, ", got ", t.size(1)); int64_t nnz = sparse._nnz(); @@ -858,7 +858,7 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum, ScalarTyp } Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) { - AT_CHECK(input._nnz() > 0, "_sparse_sum: sparse tensor input._nnz() == 0, please call torch.sparse.sum(input) instead.") + TORCH_CHECK(input._nnz() > 0, "_sparse_sum: sparse tensor input._nnz() == 0, please call torch.sparse.sum(input) instead.") const int64_t input_dim = input.dim(); auto dims_to_sum_b = dim_list_to_bitset(dims_to_sum, input_dim); @@ -975,8 +975,8 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) { // - grad.values might have zeros // -------------------------------------------------------------------- Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_, IntArrayRef dims_to_sum) { - AT_CHECK(!grad_.is_cuda(), "_sparse_sum_backward_cpu: expected 'grad_' to be CPU tensor, but got CUDA tensor"); - AT_CHECK(!input_.is_cuda(), "_sparse_sum_backward_cpu: expected 'input_' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!grad_.is_cuda(), "_sparse_sum_backward_cpu: expected 'grad_' to be CPU tensor, but got CUDA tensor"); + TORCH_CHECK(!input_.is_cuda(), "_sparse_sum_backward_cpu: expected 'input_' to be CPU tensor, but got CUDA tensor"); auto input = input_.coalesce(); const int64_t input_dim = input.dim(); @@ -1009,7 +1009,7 @@ Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_, const bool sum_sparse_dim = (sparse_dims_to_sum_size > 0); if (sum_all_sparse_dim) { - AT_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be dense since all sparse dims are summed"); + TORCH_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be dense since all sparse dims are summed"); auto grad_input_values = grad_; auto expand_size = input_values.sizes().vec(); if (sum_dense_dim) { @@ -1023,7 +1023,7 @@ Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_, return 
at::_sparse_coo_tensor_with_dims_and_tensors(input_sparse_dim, input_dense_dim, input_sizes, input_indices.clone(), grad_input_values, input.options().dtype(grad_.dtype())); // convert to grad dtype } else { - AT_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be sparse, but got dense"); + TORCH_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cpu: expected grad_ Tensor to be sparse, but got dense"); auto grad = grad_.coalesce(); LongTensor grad_indices = grad._indices(); Tensor grad_values = grad._values(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu index f364ee7d204f..bea1a8aa94eb 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu @@ -66,7 +66,7 @@ inline cusparseHandle_t setCUDASparseStream() { } void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr) { - AT_CHECK((m <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (nnz <= INT_MAX), "cusparseXcoo2csr only supports m, nnz with the bound [val] <= ", INT_MAX); @@ -117,7 +117,7 @@ void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t cusparseOperation_t opa = convertTransToCusparseOperation(transa); cusparseOperation_t opb = convertTransToCusparseOperation(transb); - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), "cusparseScsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); int i_m = (int)m; int i_n = (int)n; @@ -144,7 +144,7 @@ void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t cusparseOperation_t opa = convertTransToCusparseOperation(transa); cusparseOperation_t opb = convertTransToCusparseOperation(transb); - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (nnz <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX), "cusparseDcsrmm2 only supports m, n, k, nnz, ldb, ldc with the bound [val] <= ", INT_MAX); int i_m = (int)m; int i_n = (int)n; @@ -169,7 +169,7 @@ void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t /* format conversion */ void CreateIdentityPermutation(int64_t nnz, int *P) { - AT_CHECK((nnz <= INT_MAX), + TORCH_CHECK((nnz <= INT_MAX), "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", INT_MAX); int i_nnz = (int)nnz; @@ -180,7 +180,7 @@ void CreateIdentityPermutation(int64_t nnz, int *P) { void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "Xcsrsort_bufferSizeExt only supports m, n, nnz with the bound [val] <=", INT_MAX); int i_m = (int)m; @@ -193,7 +193,7 @@ void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRow void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "Xcsrsort only supports m, n, nnz with the bound [val] <= ", 
INT_MAX); int i_m = (int)m; @@ -209,7 +209,7 @@ void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrC void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "Xcoosort_bufferSizeExt only supports m, n, nnz with the bound [val] <= ", INT_MAX); int i_m = (int)m; @@ -222,7 +222,7 @@ void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRow void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer) { - AT_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), + TORCH_CHECK((m <= INT_MAX) && (n <= INT_MAX) && (nnz <= INT_MAX), "XcoosortByRow only supports m, n, nnz with the bound [val] <= ", INT_MAX); int i_m = (int)m; diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 5bb961591c6f..15cd4b0e15cb 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -9,13 +9,13 @@ namespace at { namespace native { using namespace at::sparse; SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const SparseTensor& mask) { - AT_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); - AT_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", + TORCH_CHECK(mask.is_coalesced(), "sparse_mask: mask is uncoalesced"); + TORCH_CHECK(mask.sizes().equals(t.sizes()), "sparse_mask: operands have incompatible sizes; self has size ", t.sizes(), " but mask has size ", mask.sizes()); AT_ASSERT(t.is_cuda()); // dispatch argument - AT_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); - AT_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r, t, mask}), + TORCH_CHECK(mask.is_cuda(), "sparse_mask: expected 'mask' to be CUDA, but got CPU"); + TORCH_CHECK(r.is_cuda(), "sparse_mask: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(cuda::check_device({r, t, mask}), "sparse_mask: arguments are located on different devices; self is on device ", t.get_device(), ", mask is on device ", mask.get_device(), ", out is on device ", r.get_device()); resize_as_sparse_(r, mask); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index bf7bb8b73cd8..452fb397c13b 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -52,14 +52,14 @@ namespace { Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseTensor& sparse_, const Tensor& dense, Scalar beta, Scalar alpha) { AT_ASSERT(t.is_cuda()); // dispatch argument - AT_CHECK(r_.is_cuda(), "addmm: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); - AT_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "addmm: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(sparse_.is_cuda(), "addmm: expected 'mat1' to be CUDA, but got CPU"); + TORCH_CHECK(dense.is_cuda(), "addmm: expected 'mat2' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({sparse_, r_, t, dense})); + 
TORCH_CHECK(cuda::check_device({sparse_, r_, t, dense})); - AT_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor"); - AT_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims"); + TORCH_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor"); + TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims"); // no need to check dense_dim because dense_dim + sparse_dim = dim // mxk * kxn = mxn @@ -67,11 +67,11 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT int64_t k = sparse_.size(1); int64_t n = dense.size(1); - AT_CHECK(t.size(0) == m, + TORCH_CHECK(t.size(0) == m, "addmm: Argument #1 (t): Expected dim 0 size ", m, ", got ", t.size(0)); - AT_CHECK(t.size(1) == n, + TORCH_CHECK(t.size(1) == n, "addmm: Argument #1 (t): Expected dim 1 size ", n, ", got ", t.size(1)); - AT_CHECK(dense.size(0) == k, + TORCH_CHECK(dense.size(0) == k, "addmm: Argument #3 (dense): Expected dim 0 size ", k, ", got ", dense.size(0)); r_.resize_({m, n}); @@ -181,23 +181,23 @@ Tensor& s_addmm_sparse_dense_cuda_( SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse_, const Tensor& dense/* , Scalar alpha */) { AT_ASSERT(sparse_.is_cuda()); // dispatch argument - AT_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "hspmm: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(dense.is_cuda(), "hspmm: expected 'mat2' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r_, sparse_, dense})); + TORCH_CHECK(cuda::check_device({r_, sparse_, dense})); - AT_CHECK(sparse_.sparse_dim() == 2, + TORCH_CHECK(sparse_.sparse_dim() == 2, "hspmm: Argument #2: 2D tensor expected, got ", sparse_.sparse_dim(), "D tensor"); - AT_CHECK(sparse_.dense_dim() == 0, + TORCH_CHECK(sparse_.dense_dim() == 0, "hspmm: Argument #2: scalar values expected, got ", sparse_.dense_dim(), "D values"); - AT_CHECK(dense.dim() == 2, + TORCH_CHECK(dense.dim() == 2, "hspmm: Argument #3: 2D tensor expected, got ", dense.dim(), "D tensor"); int64_t m = sparse_.size(0); int64_t k = sparse_.size(1); int64_t n = dense.size(1); - AT_CHECK(dense.size(0) == k, + TORCH_CHECK(dense.size(0) == k, "hspmm: Argument #3: Expected dim 0 size ", k, ", got ", dense.size(0)); get_sparse_impl(r_)->resize_and_clear_(1, 1, {m, n}); @@ -252,12 +252,12 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR const SparseTensor& sparse = sparse_.tref; AT_ASSERT(dense.is_cuda()); // dispatch argument - AT_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); - AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(sparse.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({sparse, r_, dense})); + TORCH_CHECK(cuda::check_device({sparse, r_, dense})); - AT_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", + TORCH_CHECK(dense.sizes().equals(sparse.sizes()), "add: expected 'self' and 'other' to have same size, but self has size ", dense.sizes(), " while other has size ", 
sparse.sizes(), " (FYI: dense-sparse addition does not currently support broadcasting)"); const int64_t nnz = sparse._nnz(); @@ -272,7 +272,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR r_.resize_as_(dense); r_.copy_(dense); } else { - AT_CHECK(r_.is_contiguous(), "add: CUDA dense-sparse addition with a non-contiguous output tensor does not work; shout if you need it (see https://github.com/pytorch/pytorch/issues/1521 )"); + TORCH_CHECK(r_.is_contiguous(), "add: CUDA dense-sparse addition with a non-contiguous output tensor does not work; shout if you need it (see https://github.com/pytorch/pytorch/issues/1521 )"); r = r_.contiguous(); } @@ -293,7 +293,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); if (sparse.dense_dim() == 0) { - AT_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); AT_DISPATCH_ALL_TYPES_AND( at::ScalarType::Half, values.scalar_type(), "add_out_dense_sparse_cuda", [&] { @@ -304,7 +304,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR static_cast(nnz)); }); } else { - AT_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); // sparseElementwiseKernel needs values to be contiguous too values = values.contiguous(); @@ -354,11 +354,11 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, SparseTensorR SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const SparseTensor& src, Scalar value) { AT_ASSERT(t.is_cuda()); // dispatch argument - AT_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); - AT_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(src.is_cuda(), "add: expected 'other' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "add: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r_, t, src})); - AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); + TORCH_CHECK(cuda::check_device({r_, t, src})); + TORCH_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { return copy_sparse_to_sparse_(r_, t); @@ -367,7 +367,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const return mul_out_sparse_scalar(r_, src, value); } - AT_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); + TORCH_CHECK(is_same_density(t, src), "add: expected 'self' and 'other' to have same density, but 'self' has ", t.sparse_dim(), " sparse dimensions while 'other' has ", src.sparse_dim(), " sparse dimensions"); // We deliberately choose to simply concat the indices and values tensors // rather than merging them. 
This removes the need to synchronously fetch nnz @@ -413,10 +413,10 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons } AT_ASSERT(t_.is_cuda()); // dispatch argument - AT_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); - AT_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); - AT_CHECK(cuda::check_device({r_, t_, src_})); - AT_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); + TORCH_CHECK(src_.is_cuda(), "mul: expected 'other' to be CUDA, but got CPU"); + TORCH_CHECK(r_.is_cuda(), "mul: expected 'out' to be CUDA, but got CPU"); + TORCH_CHECK(cuda::check_device({r_, t_, src_})); + TORCH_CHECK(t_.sizes().equals(src_.sizes()), "mul: expected 'self' and 'other' to have same size, but ", t_.sizes(), " != ", src_.sizes()); SparseTensor t = t_.coalesce(); SparseTensor src = src_.coalesce(); @@ -445,7 +445,7 @@ SparseTensor& mul_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t_, cons int curDevice = -1; cudaGetDevice(&curDevice); cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); - AT_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); LongTensor resultNnz = at::empty({1}, CUDA(kLong)); AT_DISPATCH_ALL_TYPES_AND( @@ -519,8 +519,8 @@ __global__ void _sparse_sum_backward_cuda_kernel( } Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_, IntArrayRef dims_to_sum) { - AT_CHECK(grad_.is_cuda(), "_sparse_sum_backward_cuda: expected 'grad_' to be CUDA tensor, but got CPU tensor"); - AT_CHECK(input_.is_cuda(), "_sparse_sum_backward_cuda: expected 'input_' to be CUDA tensor, but got CPU tensor"); + TORCH_CHECK(grad_.is_cuda(), "_sparse_sum_backward_cuda: expected 'grad_' to be CUDA tensor, but got CPU tensor"); + TORCH_CHECK(input_.is_cuda(), "_sparse_sum_backward_cuda: expected 'input_' to be CUDA tensor, but got CPU tensor"); auto input = input_.coalesce(); const int64_t input_dim = input.dim(); @@ -553,7 +553,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_ const bool sum_sparse_dim = (sparse_dims_to_sum_size > 0); if (sum_all_sparse_dim) { - AT_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad Tensor to be dense since all sparse dims are summed"); + TORCH_CHECK(!grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad Tensor to be dense since all sparse dims are summed"); auto grad_input_values = grad_; auto expand_size = input_values.sizes().vec(); if (sum_dense_dim) { @@ -566,7 +566,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_ return at::_sparse_coo_tensor_with_dims_and_tensors(input_sparse_dim, input_dense_dim, input_sizes, input_indices.clone(), grad_input_values, input.options().dtype(grad_.dtype())); // convert to grad dtype } else { - AT_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad_ Tensor to be sparse, but got dense"); + TORCH_CHECK(grad_.is_sparse(), "_sparse_sum_backward_cuda: expected grad_ Tensor to be sparse, but got dense"); auto grad = grad_.coalesce(); LongTensor grad_indices = grad._indices(); Tensor grad_values = grad._values(); @@ -617,7 +617,7 @@ Tensor _sparse_sum_backward_cuda(const Tensor& grad_, const SparseTensor& input_ int64_t total_threads = input_nnz; const dim3 
block = dim3(std::min(static_cast(cuda::getApplyBlock().x), total_threads)); dim3 grid; - AT_CHECK(cuda::getApplyGrid(total_threads, grid, curDevice), "_sparse_sum_backward_cuda: input too large or too many dimensions"); + TORCH_CHECK(cuda::getApplyGrid(total_threads, grid, curDevice), "_sparse_sum_backward_cuda: input too large or too many dimensions"); auto grad_indices_ti = getTensorInfo(grad_indices_1D); auto input_indices_ti = getTensorInfo(input_indices_1D); diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py index 30a40ae83c34..e93e41dbd46a 100644 --- a/aten/src/ATen/native_parse.py +++ b/aten/src/ATen/native_parse.py @@ -127,6 +127,8 @@ def type_argument_translations(arg): # we change this at either a JIT schema or C++ level. elif default == 'Mean': default = 'Reduction::Mean' + elif default == 'contiguous_format': + default = 'MemoryFormat::Contiguous' else: try: default = int(default) diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml index 37a50089cf1f..5f15b1ec59bc 100644 --- a/aten/src/ATen/nn.yaml +++ b/aten/src/ATen/nn.yaml @@ -118,12 +118,6 @@ # Pooling -- name: _thnn_adaptive_avg_pool3d(Tensor self, IntArrayRef[3] output_size) - cname: VolumetricAdaptiveAveragePooling - scalar_check: - output: 'false' - grad_input: 'false' - - name: _thnn_avg_pool2d(Tensor self, IntArrayRef[2] kernel_size, IntArrayRef[2] stride={}, IntArrayRef[2] padding=0, bool ceil_mode=false, bool count_include_pad=true) cname: SpatialAveragePooling default_init: @@ -140,14 +134,6 @@ output: 'false' grad_input: 'false' -- name: _thnn_max_pool2d_with_indices(Tensor self, IntArrayRef[2] kernel_size, IntArrayRef[2] stride={}, IntArrayRef[2] padding=0, IntArrayRef[2] dilation=1, bool ceil_mode=false) - cname: SpatialDilatedMaxPooling - default_init: - stride: kernel_size - scalar_check: - output: 'false' - grad_input: 'false' - - name: _thnn_max_pool3d_with_indices(Tensor self, IntArrayRef[3] kernel_size, IntArrayRef[3] stride={}, IntArrayRef[3] padding=0, IntArrayRef[3] dilation=1, bool ceil_mode=false) cname: VolumetricDilatedMaxPooling default_init: @@ -168,53 +154,6 @@ output: 'false' grad_input: 'false' -# Upsampling - -# Note: The upsampling backwards functions also include an IntArrayRef input_size -# parameter, which is added by nn_parse.py - -- name: _thnn_upsample_linear1d(Tensor self, IntArrayRef[1] output_size, bool align_corners) - cname: TemporalUpSamplingLinear - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_bilinear2d(Tensor self, IntArrayRef[2] output_size, bool align_corners) - cname: SpatialUpSamplingBilinear - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_bicubic2d(Tensor self, IntArrayRef[2] output_size, bool align_corners) - cname: SpatialUpSamplingBicubic - scalar_check: - grad_input: 'false' - -- name: _thnn_upsample_trilinear3d(Tensor self, IntArrayRef[3] output_size, bool align_corners) - cname: VolumetricUpSamplingTrilinear - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_nearest1d(Tensor self, IntArrayRef[1] output_size) - cname: TemporalUpSamplingNearest - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_nearest2d(Tensor self, IntArrayRef[2] output_size) - cname: SpatialUpSamplingNearest - scalar_check: - self: 'false' - grad_input: 'false' - -- name: _thnn_upsample_nearest3d(Tensor self, IntArrayRef[3] output_size) - cname: VolumetricUpSamplingNearest - scalar_check: - self: 'false' - grad_input: 'false' - - # Private 
functions. These also exist in TH, but we want the backwards functions # to implement derivatives. diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index c5e7cdfad0f2..0564a9904411 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -19,6 +19,8 @@ ], 'quantized': [ 'QInt8', + 'QUInt8', + 'QInt32', ] } diff --git a/aten/src/ATen/quantized/QTensorImpl.h b/aten/src/ATen/quantized/QTensorImpl.h index 3aeeb4025702..f41d1ec15ca6 100644 --- a/aten/src/ATen/quantized/QTensorImpl.h +++ b/aten/src/ATen/quantized/QTensorImpl.h @@ -25,7 +25,9 @@ struct CAFFE2_API QTensorImpl : public c10::TensorImpl { return quantizer_; } - c10::intrusive_ptr shallow_copy_and_detach() const override { + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { auto impl = c10::make_intrusive( Storage(storage()), type_id(), quantizer_); impl->set_sizes_and_strides(sizes(), strides()); @@ -34,6 +36,8 @@ struct CAFFE2_API QTensorImpl : public c10::TensorImpl { impl->reserved_ = reserved_; impl->refresh_numel(); impl->refresh_contiguous(); + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); return impl; } diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 41c29b7bc414..0e913357e887 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -11,11 +11,263 @@ namespace at { +void checkFloatCPUTensor(std::string fn_name, Tensor t) { + TORCH_CHECK( + t.scalar_type() == kFloat, + fn_name, + "expects a Float Tensor."); + TORCH_CHECK( + t.device() == kCPU, + fn_name, + "expects a CPU Tensor."); +} + +template +void checkQuantizedCPUTensor(std::string fn_name, Tensor t) { + TORCH_CHECK(t.is_quantized(), + fn_name, + "expects a quantized Tensor."); + TORCH_CHECK(t.scalar_type() == caffe2::TypeMeta::Make(), + fn_name, + "expects a ", + caffe2::TypeMeta::Make(), + "Tensor"); + TORCH_CHECK(t.device() == kCPU, + fn_name, + "expects a CPU quantized Tensor"); +} + +template +void checkZeroPoint(std::string fn_name, int32_t zero_point) { + TORCH_CHECK(zero_point <= std::numeric_limits::max(), + fn_name, + "zero_point is out of range."); + TORCH_CHECK(zero_point >= std::numeric_limits::min(), + fn_name, + "zero_point is out of range."); +} + +template +void checkZeroPoints(std::string fn_name, std::vector zero_points) { + for (int i = 0; i < zero_points.size(); ++i) { + TORCH_CHECK(zero_points[i] <= std::numeric_limits::max(), + fn_name, + "zero_point", + i, + "is out of range."); + TORCH_CHECK(zero_points[i] >= std::numeric_limits::min(), + fn_name, + "zero_point", + i, + "is out of range."); + } +} + +#ifdef USE_FBGEMM +// Note: quantize_val is only explicitly used in test outside of this file +template +T quantize_val(float scale, int32_t zero_point, float value) { + // Internally, fbgemm::Quantize uses std::nearbyint. + // std::nearbyint results in nearest integer value according to the current + // rounding mode and the default rounding mode is rounds to even in half-way + // cases in most popular processor architectures like x86 and ARM. 
This is + // typically faster than an alternatives like std::round that rounds half-way + // cases away from zero, and can be consistent with SIMD implementations for + // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with + // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode. + int32_t qvalue; + qvalue = fbgemm::Quantize( + value, + zero_point, + scale, + /*result_precision=*/CHAR_BIT * sizeof(typename T::underlying)); + return static_cast(qvalue); +} + +// TODO: dequantize_val? + +template +Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point) { + auto fn_name = "quantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const float* rd = rtensor.data(); + auto qd = reinterpret_cast(qtensor.data()); + fbgemm::TensorQuantizationParams qparams; + qparams.scale = scale; + qparams.zero_point = zero_point; + qparams.precision = CHAR_BIT * sizeof(typename T::underlying); + fbgemm::Quantize(/*src=*/rd, + /*dst=*/qd, + /*len=*/rtensor.numel(), + /*qparams=*/qparams); + return qtensor; +} + +template +Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, float scale, int32_t zero_point) { + auto fn_name = "dequantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const auto* qd = reinterpret_cast(qtensor.data()); + fbgemm::TensorQuantizationParams qparams; + qparams.scale = scale; + qparams.zero_point = zero_point; + qparams.precision = CHAR_BIT * sizeof(typename T::underlying); + float* rd = rtensor.data(); + fbgemm::Dequantize(/*src=*/qd, + /*dst=*/rd, + /*len=*/qtensor.numel(), + /*qparams=*/qparams); + return rtensor; +} +#else + +template +T quantize_val(float scale, int32_t zero_point, float value) { + // std::nearbyint results in nearest integer value according to the current + // rounding mode and the default rounding mode is rounds to even in half-way + // cases in most popular processor architectures like x86 and ARM. This is + // typically faster than an alternatives like std::round that rounds half-way + // cases away from zero, and can be consistent with SIMD implementations for + // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with + // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode. 
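// Illustrative half-way case for the comment above (values made up, assuming the
// default FE_TONEAREST rounding mode): with scale = 1.0 and zero_point = 2, an
// input of 0.5 maps to value / scale + zero_point = 2.5, an exact tie, and
//   std::nearbyint(2.5f) == 2.0f   // ties to even -- what this function computes
//   std::round(2.5f)     == 3.0f   // ties away from zero
//   std::nearbyint(3.5f) == 4.0f   // both agree when rounding up lands on even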
+ int32_t qvalue; + constexpr int32_t qmin = std::numeric_limits::min(); + constexpr int32_t qmax = std::numeric_limits::max(); + checkZeroPoint("quantize_val", zero_point); + qvalue = static_cast(std::nearbyint(value / scale + zero_point)); + qvalue = std::max(qvalue, qmin); + qvalue = std::min(qvalue, qmax); + return static_cast(qvalue); +} + +template +Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point) { + auto fn_name = "quantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const float* rdata = rtensor.data(); + auto qdata = qtensor.data(); + for (int i = 0; i < rtensor.numel(); ++i) { + qdata[i] = quantize_val(scale, zero_point, rdata[i]); + } + return qtensor; +} + +template +Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, float scale, int32_t zero_point) { + auto fn_name = "dequantize_tensor"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoint(fn_name, zero_point); + const auto* qd = qtensor.data(); + float* rd = rtensor.data(); + for (auto i = 0; i < qtensor.numel(); ++i) { + // We need to convert the qint8 value to float to ensure the subtraction + // subexpression returns a float + rd[i] = (static_cast(qd[i].val_) - zero_point) * scale; + } + return rtensor; +} +#endif +template CAFFE2_API qint8 quantize_val(float scale, int32_t zero_point, float value); +template CAFFE2_API quint8 quantize_val(float scale, int32_t zero_point, float value); +template CAFFE2_API qint32 quantize_val(float scale, int32_t zero_point, float value); +template CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor dequantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor dequantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template CAFFE2_API Tensor dequantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); + + +// TODO: add fbgemm for per channel +template +Tensor quantize_tensor_per_channel_affine(Tensor rtensor, + Tensor qtensor, + std::vector scales, + std::vector zero_points, + std::vector axis) { + auto fn_name = "quantize_tensor_per_channel_affine"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoints(fn_name, zero_points); + int64_t channel_axis = axis[0]; + TORCH_CHECK(channel_axis < rtensor.dim(), "Channel axis out of range in per channel affine quantization."); + int64_t batches = size_to_dim_(channel_axis, rtensor.sizes()); + int64_t elements_per_channel = size_from_dim_(channel_axis + 1, rtensor.sizes()); + int64_t channel = rtensor.size(channel_axis); + TORCH_CHECK(channel == scales.size(), + "length of scales must equal to channel"); + TORCH_CHECK(channel == zero_points.size(), + "length of zero_points must equal to channel"); + const float* rdata = rtensor.data(); + auto qdata = qtensor.data(); + for (auto b = 0; b < batches; ++b) { + for (auto c = 0; c < channel; ++c) { + for (auto e = 0; e < elements_per_channel; ++e) { + auto i = b * channel * elements_per_channel + c * elements_per_channel + e; + qdata[i] = quantize_val(scales[c], zero_points[c], 
rdata[i]); + } + } + } + return qtensor; +} + +template +Tensor dequantize_tensor_per_channel_affine(Tensor qtensor, + Tensor rtensor, + std::vector scales, + std::vector zero_points, + std::vector axis) { + auto fn_name = "dequantize_tensor_per_channel_affine"; + checkFloatCPUTensor(fn_name, rtensor); + checkQuantizedCPUTensor(fn_name, qtensor); + checkZeroPoints(fn_name, zero_points); + int64_t channel_axis = axis[0]; + TORCH_CHECK(channel_axis < qtensor.dim(), + "Channel axis out of range in per channel affine dequantization."); + int64_t batches = size_to_dim_(channel_axis, rtensor.sizes()); + int64_t elements_per_channel = size_from_dim_(channel_axis + 1, rtensor.sizes()); + int64_t channel = rtensor.size(channel_axis); + TORCH_CHECK(channel == scales.size(), + "length of scales must equal to channel"); + TORCH_CHECK(channel == zero_points.size(), + "length of zero_points must equal to channel"); + const auto* qd = qtensor.data(); + float* rd = rtensor.data(); + for (auto b = 0; b < batches; ++b) { + for (auto c = 0; c < channel; ++c) { + for (auto e = 0; e < elements_per_channel; ++e) { + auto i = b * channel * elements_per_channel + c * elements_per_channel + e; + // We need to convert the qint8 value to float to ensure the subtraction + // subexpression returns a float + rd[i] = (static_cast(qd[i].val_) - zero_points[c]) * scales[c]; + } + } + } + return rtensor; +} + QuantizerPtr make_per_tensor_affine_quantizer( double scale, - int64_t zero_point) { - return c10::make_intrusive( - static_cast(scale), static_cast(zero_point)); + int64_t zero_point, + ScalarType scalar_type) { + return c10::make_intrusive(scalar_type, + static_cast(scale), static_cast(zero_point)); +} + +QuantizerPtr make_per_channel_affine_quantizer( + std::vector scales, + std::vector zero_points, + std::vector axis, + ScalarType scalar_type) { + return c10::make_intrusive(scalar_type, + scales, zero_points, axis); } QTensorImpl* get_qtensorimpl(const Tensor& self) { @@ -39,7 +291,7 @@ inline Tensor new_qtensor_cpu( auto* allocator = at::getCPUAllocator(); int64_t nelements = at::prod_intlist(sizes); auto dtype = options.dtype(); - AT_CHECK(isQIntType(typeMetaToScalarType(dtype)), + TORCH_CHECK(isQIntType(typeMetaToScalarType(dtype)), "ScalarType is not supported in new_qtensor_cpu."); auto storage = c10::make_intrusive( dtype, @@ -53,91 +305,84 @@ inline Tensor new_qtensor_cpu( return tensor; } -qint8 quantize_uint8(float scale, uint8_t zero_point, float value) { - // Internally, fbgemm::Quantize uses std::nearbyint. - // std::nearbyint results in nearest integer value according to the current - // rounding mode and the default rounding mode is rounds to even in half-way - // cases in most popular processor architectures like x86 and ARM. This is - // typically faster than an alternatives like std::round that rounds half-way - // cases away from zero, and can be consistent with SIMD implementations for - // example in x86 using _mm512_cvtps_epi32 or mm512_round_ps with - // _MM_FROUND_CUR_DIRECTION option that also follow the current rounding mode. 
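// Worked index example for the per-channel loops above (shapes are illustrative,
// not from this PR): for a contiguous rtensor of shape {2, 3, 4} quantized along
// axis 1, the helpers above yield
//   batches              = size_to_dim_(1, sizes)      = 2
//   channel              = rtensor.size(1)             = 3
//   elements_per_channel = size_from_dim_(2, sizes)    = 4
// so element (b = 1, c = 2, e = 3) sits at flat offset
//   i = b * channel * elements_per_channel + c * elements_per_channel + e
//     = 1 * 3 * 4 + 2 * 4 + 3 = 23
// and is (de)quantized with scales[2] and zero_points[2].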
- int32_t qvalue; -#ifdef USE_FBGEMM - qvalue = fbgemm::Quantize(value, zero_point, scale, - /*result_precision=*/8); -#else - constexpr int32_t qmin = std::numeric_limits::min(); - constexpr int32_t qmax = std::numeric_limits::max(); - qvalue = static_cast(std::nearbyint(value / scale + zero_point)); - qvalue = std::max(qvalue, qmin); - qvalue = std::min(qvalue, qmax); -#endif - return static_cast(qvalue); -} - -Tensor PerTensorAffineQuantizer::quantize(Tensor tensor) { - IntArrayRef sizes = tensor.sizes(); +Tensor PerTensorAffineQuantizer::quantize(Tensor rtensor) { + TORCH_CHECK( + rtensor.scalar_type() == kFloat, + "quantize only works on Float Tensor."); + TORCH_CHECK( + rtensor.device() == kCPU, + "quantize only works for CPU backend right now."); // Here we need a std::intrusive_ptr.. but actually "this" is the // quantizer that can be reused, so I'm using intrusive_from_this here - AT_CHECK( - tensor.options().device() == kCPU, - "quantize only works for CPU backend right now."); - Tensor qv = new_qtensor_cpu( - sizes, - tensor.options().dtype(at::kQInt8), + Tensor qtensor = new_qtensor_cpu( + rtensor.sizes(), + rtensor.options().dtype(scalar_type_), intrusive_from_this()); - tensor = tensor.contiguous(); - const float* svd = tensor.data(); + rtensor = rtensor.contiguous(); + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), "quantize_tensor", [&]() { + qtensor = quantize_tensor(rtensor, qtensor, scale_, zero_point_); + }); + return qtensor; +} -#ifdef USE_FBGEMM - auto qvd = reinterpret_cast(qv.data()); - fbgemm::TensorQuantizationParams qparams; - qparams.scale = scale_; - qparams.zero_point = zero_point_; - qparams.precision = 8; - fbgemm::Quantize(/*src=*/svd, - /*dst=*/qvd, - /*len=*/tensor.numel(), - /*qparams=*/qparams); -#else - auto qvd = qv.data(); - for (int i = 0; i < tensor.numel(); ++i) { - qvd[i] = quantize_uint8(scale_, zero_point_, svd[i]); - } -#endif - return qv; +Tensor PerTensorAffineQuantizer::dequantize(Tensor qtensor) { + TORCH_CHECK(qtensor.is_quantized(), + "dequantize is only supported in quantized Tensor."); + TORCH_CHECK( + qtensor.device() == kCPU, + "dequantize only works for CPU backend right now."); + Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat)); + qtensor = qtensor.contiguous(); + + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), "dequantize_tensor", [&]() { + rtensor = dequantize_tensor(qtensor, rtensor, scale_, zero_point_); + }); + + return rtensor; } -Tensor PerTensorAffineQuantizer::dequantize(Tensor tensor) { - std::vector sizes = tensor.sizes().vec(); - at::TensorOptions options = tensor.options().dtype(at::kFloat); +Tensor PerChannelAffineQuantizer::quantize(Tensor rtensor) { + TORCH_CHECK( + rtensor.scalar_type() == kFloat, + "quantize only works on Float Tensor."); + TORCH_CHECK( + rtensor.device() == kCPU, + "quantize only works for CPU backend right now."); + // Here we need a std::intrusive_ptr.. 
but actually "this" is the + // quantizer that can be reused, so I'm using intrusive_from_this here + Tensor qtensor = new_qtensor_cpu( + rtensor.sizes(), + rtensor.options().dtype(scalar_type_), + intrusive_from_this()); - Tensor rv = at::empty(sizes, options); - float* rvd = rv.data(); - tensor = tensor.contiguous(); + rtensor = rtensor.contiguous(); + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), + "quantize_tensor_per_channel_affine", + [&]() { + qtensor = quantize_tensor_per_channel_affine( + rtensor, qtensor, scales_, zero_points_, axis_); + }); + return qtensor; +} -#ifdef USE_FBGEMM - const auto* qvd = reinterpret_cast(tensor.data()); - fbgemm::TensorQuantizationParams qparams; - qparams.scale = scale_; - qparams.zero_point = zero_point_; - qparams.precision = 8; - fbgemm::Dequantize(/*src=*/qvd, - /*dst=*/rvd, - /*len=*/tensor.numel(), - /*qparams=*/qparams); -#else - const auto* qvd = tensor.data(); - for (auto i = 0; i < tensor.numel(); ++i) { - // We need to convert the qint8 value to float to ensure the subtraction - // subexpression returns a float - rvd[i] = (static_cast(qvd[i].val_) - zero_point_) * scale_; - } -#endif +Tensor PerChannelAffineQuantizer::dequantize(Tensor qtensor) { + TORCH_CHECK(qtensor.is_quantized(), + "dequantize is only supported in quantized Tensor."); + TORCH_CHECK( + qtensor.device() == kCPU, + "dequantize only works for CPU backend right now."); + Tensor rtensor = at::empty(qtensor.sizes(), qtensor.options().dtype(at::kFloat)); + qtensor = qtensor.contiguous(); + + AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), + "dequantize_tensor_per_channel_affine", + [&]() { + rtensor = dequantize_tensor_per_channel_affine( + qtensor, rtensor, scales_, zero_points_, axis_); + }); - return rv; + return rtensor; } Quantizer::~Quantizer() {} diff --git a/aten/src/ATen/quantized/Quantizer.h b/aten/src/ATen/quantized/Quantizer.h index e735f8f33c1d..d103b0801768 100644 --- a/aten/src/ATen/quantized/Quantizer.h +++ b/aten/src/ATen/quantized/Quantizer.h @@ -43,7 +43,8 @@ using QuantizerPtr = c10::intrusive_ptr; */ struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { const QScheme qscheme_; - explicit Quantizer(QScheme qscheme) : qscheme_(qscheme) {} + const ScalarType scalar_type_; + explicit Quantizer(QScheme qscheme, ScalarType scalar_type) : qscheme_(qscheme), scalar_type_(scalar_type) {} virtual ~Quantizer(); // Copied from torch/csrc/jit/scope.h @@ -55,10 +56,14 @@ struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { return c10::intrusive_ptr::reclaim(this); } - virtual QScheme qscheme() { + QScheme qscheme() { return qscheme_; } + ScalarType scalar_type() { + return scalar_type_; + } + /** * quantize a float Tensor into a quantized Tensor. */ @@ -77,7 +82,7 @@ struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { * the most commonly used scheme in this category. */ struct CAFFE2_API UniformQuantizer : public Quantizer { - explicit UniformQuantizer(QScheme qscheme) : Quantizer(qscheme) {} + explicit UniformQuantizer(QScheme qscheme, ScalarType scalar_type) : Quantizer(qscheme, scalar_type) {} }; /** @@ -86,7 +91,7 @@ struct CAFFE2_API UniformQuantizer : public Quantizer { * value. K-means quantization is a representative example in this category. 
*/ struct CAFFE2_API NonUniformQuantizer : public Quantizer { - explicit NonUniformQuantizer(QScheme qscheme) : Quantizer(qscheme) {} + explicit NonUniformQuantizer(QScheme qscheme, ScalarType scalar_type) : Quantizer(qscheme, scalar_type) {} }; // There is also StochasticQuantizer which is uniform but not affine @@ -95,12 +100,12 @@ struct CAFFE2_API NonUniformQuantizer : public Quantizer { * AffineQuantizer uses affine transformation to do quantization. * * For quantize: - * Y = clamp((X * scale + zero_point, min, max) + * Y = clamp(round(X / scale + zero_point), min, max) * For dequantize: - * X = (Y - zero_point) / scale + * X = (Y - zero_point) * scale */ struct CAFFE2_API AffineQuantizer : public UniformQuantizer { - explicit AffineQuantizer(QScheme qscheme) : UniformQuantizer(qscheme) {} + explicit AffineQuantizer(QScheme qscheme, ScalarType scalar_type) : UniformQuantizer(qscheme, scalar_type) {} }; /** @@ -108,12 +113,12 @@ struct CAFFE2_API AffineQuantizer : public UniformQuantizer { * does not have zero_point * * For quantize: - * Y = clamp(X * scale, min, max) + * Y = clamp(round(X / scale), min, max) * For dequantize: - * X = Y / scale + * X = Y * scale */ struct CAFFE2_API SymmetricQuantizer : public UniformQuantizer { - explicit SymmetricQuantizer(QScheme qscheme) : UniformQuantizer(qscheme) {} + explicit SymmetricQuantizer(QScheme qscheme, ScalarType scalar_type) : UniformQuantizer(qscheme, scalar_type) {} }; /** @@ -121,8 +126,8 @@ struct CAFFE2_API SymmetricQuantizer : public UniformQuantizer { * used for quantizing all the values in the given Tensor */ struct CAFFE2_API PerTensorSymmetricQuantizer : public SymmetricQuantizer { - explicit PerTensorSymmetricQuantizer(float scale) - : SymmetricQuantizer(kPerTensorSymmetric), scale_(scale) {} + explicit PerTensorSymmetricQuantizer(ScalarType scalar_type, float scale) + : SymmetricQuantizer(kPerTensorSymmetric, scalar_type), scale_(scale) {} float scale_{1.0}; }; @@ -138,10 +143,11 @@ struct CAFFE2_API PerTensorSymmetricQuantizer : public SymmetricQuantizer { */ struct CAFFE2_API PerChannelSymmetricQuantizer : public SymmetricQuantizer { explicit PerChannelSymmetricQuantizer( + ScalarType scalar_type, const std::vector& scales, const std::vector& axis) - : SymmetricQuantizer(kPerChannelSymmetric), scales_(scales), axis_(axis) { - AT_CHECK( + : SymmetricQuantizer(kPerChannelSymmetric, scalar_type), scales_(scales), axis_(axis) { + TORCH_CHECK( axis_.size() == 1, "Per channel symmetric quantization in multiple axis is not supported yet."); } @@ -164,8 +170,8 @@ struct CAFFE2_API PerChannelSymmetricQuantizer : public SymmetricQuantizer { * all the values in the Tensor. 
*/ struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { - explicit PerTensorAffineQuantizer(float scale, uint8_t zero_point) - : AffineQuantizer(kPerTensorAffine), + explicit PerTensorAffineQuantizer(ScalarType scalar_type, float scale, int32_t zero_point) + : AffineQuantizer(kPerTensorAffine, scalar_type), scale_(scale), zero_point_(zero_point) {} @@ -176,13 +182,14 @@ struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { return scale_; } - uint8_t zero_point() const { + int32_t zero_point() const { return zero_point_; } private: const float scale_; - const uint8_t zero_point_; + // We use int32_t to support both uint8_t and int32_t data types + const int32_t zero_point_; }; /** @@ -192,14 +199,15 @@ struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { */ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { explicit PerChannelAffineQuantizer( + ScalarType scalar_type, const std::vector& scales, - const std::vector& zero_points, + const std::vector& zero_points, const std::vector& axis) - : AffineQuantizer(kPerChannelAffine), - scales_(scales), - zero_points_(zero_points), - axis_(axis) { - AT_CHECK( + : AffineQuantizer(kPerChannelAffine, scalar_type), + scales_(scales), + zero_points_(zero_points), + axis_(axis) { + TORCH_CHECK( axis_.size() == 1, "Per channel affine quantization in multiple axis is not supported yet."); } @@ -208,7 +216,7 @@ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { return scales_; } - std::vector zero_points() const { + std::vector zero_points() const { return zero_points_; } @@ -216,9 +224,12 @@ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { return axis_; } + Tensor quantize(Tensor tensor) override; + Tensor dequantize(Tensor tensor) override; + private: const std::vector scales_; - const std::vector zero_points_; + const std::vector zero_points_; const std::vector axis_; }; @@ -229,13 +240,21 @@ struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { // This may be called repeatedly, so make sure it's pretty cheap. 
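// Worked example of the corrected affine mapping documented above (numbers are
// illustrative): with scale = 0.5, zero_point = 3 and a quint8 range of [0, 255],
//   quantize:   X = 2.0  ->  Y = clamp(round(2.0 / 0.5 + 3), 0, 255) = 7
//   dequantize: Y = 7    ->  X = (7 - 3) * 0.5 = 2.0
// i.e. quantization divides by the scale and dequantization multiplies by it,
// matching what quantize_val / dequantize_tensor compute; the previous comments
// had the scale on the wrong side of both formulas.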
CAFFE2_API QTensorImpl* get_qtensorimpl(const Tensor& self); -// Quantize a float value into a uint8 value given scale and zero_point -CAFFE2_API qint8 quantize_uint8(float scale, uint8_t zero_point, float value); +// Quantize a float value into a uint value given scale and zero_point +template +CAFFE2_API T quantize_val(float scale, int32_t zero_point, float value); +template +CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, float scale, int32_t zero_point); +template +CAFFE2_API Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, float scale, int32_t zero_point); // double and int64_t are because of the native function API, we only have these // argument types right now in native functions CAFFE2_API QuantizerPtr -make_per_tensor_affine_quantizer(double scale, int64_t zero_point); +make_per_tensor_affine_quantizer(double scale, int64_t zero_point, ScalarType scalar_type); + +CAFFE2_API QuantizerPtr +make_per_channel_affine_quantizer(std::vector scales, std::vector zero_points, std::vector axis, ScalarType scalar_type); // Create a Quantized Tensor given arguments for normal Tensor and a quantizer CAFFE2_API Tensor new_qtensor_cpu( diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index d8d8622ce2f7..8dc9ebd4e152 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -30,7 +30,7 @@ inline Tensor from_blob( const TensorOptions& options = {}) { auto device = getType(options).getDeviceFromPtr(data); if (options.device().has_index()) { - AT_CHECK( + TORCH_CHECK( options.device() == device, "Specified device ", options.device(), " does not match device of data ", device); @@ -71,11 +71,11 @@ inline Tensor from_blob( namespace detail { static inline TypeExtendedInterface & infer_type(const Tensor & t) { - AT_CHECK(t.defined(), "undefined Tensor"); + TORCH_CHECK(t.defined(), "undefined Tensor"); return getType(t); } static inline TypeExtendedInterface & infer_type(const TensorList & tl) { - AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); return getType(tl[0]); } diff --git a/aten/src/ATen/templates/LegacyTHFunctions.h b/aten/src/ATen/templates/LegacyTHFunctions.h index fb91eeb4b2dd..fe34623618db 100644 --- a/aten/src/ATen/templates/LegacyTHFunctions.h +++ b/aten/src/ATen/templates/LegacyTHFunctions.h @@ -9,11 +9,11 @@ namespace th { namespace detail { static inline LegacyTHDispatcher & infer_dispatcher(const Tensor & t) { - AT_CHECK(t.defined(), "undefined Tensor"); + TORCH_CHECK(t.defined(), "undefined Tensor"); return getLegacyTHDispatcher(t); } static inline LegacyTHDispatcher & infer_dispatcher(const TensorList & tl) { - AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); return getLegacyTHDispatcher(tl[0]); } diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index b8fab0379096..32a538017398 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,7 @@ class CAFFE2_API Tensor { return impl_.weak_use_count(); } - const char * toString() const; + std::string toString() const; IntArrayRef sizes() const { return impl_->sizes(); @@ -165,8 +166,8 @@ class CAFFE2_API Tensor { int64_t ndimension() const { return dim(); } - bool is_contiguous() const { - return impl_->is_contiguous(); + 
bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const { + return impl_->is_contiguous(memory_format); } // Total bytes consumed by the "view" of elements of the array. Does not @@ -193,7 +194,7 @@ class CAFFE2_API Tensor { return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( tensorTypeIdToBackend(type_id()), scalar_type(), - is_variable() && !at::NonVariableTypeMode::is_enabled()); + is_variable()); } Type & dispatch_type() const { return legacyTensorType(*impl_); @@ -266,7 +267,7 @@ class CAFFE2_API Tensor { template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return TensorAccessor(data(),sizes().data(),strides().data()); } template @@ -280,7 +281,7 @@ class CAFFE2_API Tensor { template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> PackedTensorAccessor packed_accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); - AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); } template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index b9c2e9cbf1e9..b78a83ccb294 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -131,7 +132,7 @@ inline bool is_quantized(Tensor self) { #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ - AT_CHECK( \ + TORCH_CHECK( \ scalar_type() == ScalarType::name, \ "expected scalar type ", \ #name, \ diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index a6f4f479bf19..f6b3d8be7417 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index fed4545b11df..685771858e2b 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -31,7 +31,7 @@ struct ${Type} final : public ${DeviceType}TypeDefault { return t.scalar_type(); } ScalarType infer_scalar_type(const TensorList & tl) const { - AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); + TORCH_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); return tl[0].scalar_type(); } }; diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 20b643c521e4..43e9ec305fb8 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -31,6 +31,7 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cuda_apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp) diff --git a/aten/src/ATen/test/cuda_distributions_test.cu 
b/aten/src/ATen/test/cuda_distributions_test.cu new file mode 100644 index 000000000000..027c3157a59d --- /dev/null +++ b/aten/src/ATen/test/cuda_distributions_test.cu @@ -0,0 +1,143 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include + +__global__ void expected_randoms(float* x, uint64_t counter_offset) { + for(int i=0; i < 4; i++) { + curandStatePhilox4_32_10_t state; + curand_init( + 123, + i, + counter_offset, + &state); + auto ret = curand_uniform4(&state); + x[i] = ret.x; + } +} + +TEST(DistributionsTest, TestPhiloxIncrementSmallTensor) { + // Test Description: + // In Distributions.cu we mentioned that philox increment + // should be at least the number of curand() random numbers used in + // each thread. In this test, we make sure that uniform_ correctly + // increments philox and doesn't reuse randoms from previous calls + // for a small tensor size of 4. + // - We check that by first getting 4 randoms from uniform_. + // Once we get these 4 randoms, that would mean that philox counter for + // thread 0, 1, 2 and 3, was incremented by 4 (check calc_execution_policy + // function for details). + // - Now get 4 randoms with offset=4 for thread {0,1,2,3} from expected_randoms + // kernel above. + // - Now get 4 more randoms from uniform_ (note thread {0,1,2,3} for this call would + // start from a philox_offset value of 4) + // - the 4 randoms from expected_randoms and the 4 randoms from the previous call + // of uniform_ should match, signifying that the philox offset was + // incremented properly and no randoms are being reused from previous calls + + // if cuda not available, return + if (!at::cuda::is_available()) return; + + // manual seed to 123 + at::manual_seed(123); + + // get 4 randoms from uniform_(), philox offset is now incremented to 4 by this call + at::empty({4}, at::TensorOptions(at::kCUDA)).uniform_(); + + // allocate 4 float on host memory + float *x; + cudaMallocManaged(&x, 4*sizeof(float)); + + // launch kernel to get expected randoms + expected_randoms<<<1, 1>>>(x, 4); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // get 4 new float from uniform_() + auto self = at::empty({4}, at::TensorOptions(at::kCUDA)); + self.uniform_(); + + // check randoms from expected_randoms kernel are equal to the randoms from the second + // call of uniform_() + for (int i = 0; i < 4; i++) { + ASSERT_EQ(self[i].item().to(), x[i]); + } + + // Free memory + cudaFree(x); +} + +TEST(DistributionsTest, TestPhiloxIncrementBigTensor) { + // Test Description: + // In Distributions.cu we mentioned that philox increment + // should be at least the number of curand() random numbers used in + // each thread. In this test, we make sure that uniform_ correctly + // increments philox and doesn't reuse randoms from previous calls + // for a big size tensor. + // - First of all, we come up with what the size of the big tensor + // should be for this test. Our goal is to show that when the uniform_ + // kernel runs at full occupancy (i.e. when the number of elements is + // greater the number of threads launched), it hits the unroll loop in + // the uniform_ kernel. + // - Hence, we set the size of the tensor in this test to be 8 times the + // maximum number of threads we can launch. 
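Before the big-tensor description continues below, here is the arithmetic behind the offsets 4 and 8 that both tests pass to expected_randoms: each curand_uniform4 call yields four floats per thread, so a uniform_ launch has to advance every thread's Philox counter by 4 * ceil(elements_per_thread / 4). The helper below is only an illustrative sketch under that assumption; the name philox_offset_per_thread is invented here, and the real computation lives in calc_execution_policy in Distributions.cu.

#include <cstdint>
#include <iostream>

// Illustrative only: how far a launch must advance each thread's Philox counter,
// given that curand_uniform4 produces four floats per call.
uint64_t philox_offset_per_thread(uint64_t elements_per_thread) {
  const uint64_t floats_per_curand_call = 4;  // curand_uniform4 returns a float4
  const uint64_t calls =
      (elements_per_thread + floats_per_curand_call - 1) / floats_per_curand_call;
  return calls * floats_per_curand_call;
}

int main() {
  // Small-tensor test: one element per thread -> counter advances by 4.
  std::cout << philox_offset_per_thread(1) << "\n";  // prints 4
  // Big-tensor test: eight elements per thread -> counter advances by 8.
  std::cout << philox_offset_per_thread(8) << "\n";  // prints 8
  return 0;
}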
This means that, each thread will + // be yielding 8 elements, and as a result, curand_uniform4 will be called twice + // and all the 8 elements in a thread will consume all the float4 from the + // two calls of curand_unfiorm4 as a result of the unroll loop. Therefore, + // after this call to the unform_, counter_offset for the next call to uniform_ + // will start from 8. This is what we test next. + // - Now get 4 randoms with offset=8 for thread {0,1,2,3} from expected_randoms + // kernel above. + // - Now get 4 more randoms from uniform_ (note thread {0,1,2,3} for this call would + // start from a philox_offset value of 8) + // - the 4 randoms from expected_randoms kernel and the 4 randoms from the previous call + // of uniform_ should match, signifying that the philox offset was + // incremented properly and no randoms are being reused from previous calls + + // if cuda not available, return + if (!at::cuda::is_available()) return; + + // manual seed to 123 + at::manual_seed(123); + + // calculate maximum number of threads that can be launched + // and set the numel to be 8 times that + const int block_size = 256; + dim3 dim_block(block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + dim3 grid(static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm); + auto numel = block_size * grid.x * 8; + + // get numel randoms from uniform_(), philox offset is now incremented to 8 by this call + at::empty({numel}, at::TensorOptions(at::kCUDA)).uniform_(); + + // allocate 4 float on host memory + float *x; + cudaMallocManaged(&x, 4*sizeof(float)); + + // launch kernel to get expected randoms + expected_randoms<<<1, 1>>>(x, 8); + + // Wait for GPU to finish before accessing on host + cudaDeviceSynchronize(); + + // get 4 new float from uniform_() + auto self = at::empty({4}, at::TensorOptions(at::kCUDA)); + self.uniform_(); + + // check randoms from expected_randoms kernel are equal to the randoms from the second + // call of uniform_() + for (int i = 0; i < 4; i++) { + ASSERT_EQ(self[i].item().to(), x[i]); + } + + // Free memory + cudaFree(x); +} diff --git a/aten/src/ATen/test/quantized_test.cpp b/aten/src/ATen/test/quantized_test.cpp index 103e69e18662..64b4f2e4acc1 100644 --- a/aten/src/ATen/test/quantized_test.cpp +++ b/aten/src/ATen/test/quantized_test.cpp @@ -7,7 +7,7 @@ #include #include #include -// For quantize_uint8 +// For quantize_val #include #include @@ -18,7 +18,7 @@ TEST(TestQTensor, QuantDequantAPIs) { Tensor r = at::ones({num_elements}); const float scale = 1.0; const int32_t zero_point = 2; - Tensor qr = r.quantize_linear(scale, zero_point); + Tensor qr = r.quantize_linear(scale, zero_point, kQUInt8); ASSERT_EQ(qr.q_scale().to(), scale); ASSERT_EQ(qr.q_zero_point().to(), zero_point); ASSERT_TRUE(qr.is_quantized()); @@ -33,10 +33,10 @@ TEST(TestQTensor, QuantDequantAPIs) { // Check for correct quantization auto r_data = r.data(); - auto qr_data = qr.data(); + auto qr_data = qr.data(); for (auto i = 0; i < num_elements; ++i) { ASSERT_EQ( - quantize_uint8(scale, zero_point, r_data[i]).val_, qr_data[i].val_); + quantize_val(scale, zero_point, r_data[i]).val_, qr_data[i].val_); } // Check for correct dequantization @@ -60,9 +60,9 @@ TEST(TestQTensor, RoundingMode) { 6, 6, 8, 8, 10, 10}; // scale = 1.0 Tensor x = from_blob(x_values.data(), x_values.size()); - Tensor qx = x.quantize_linear(/*scale=*/1.0, zero_point); + Tensor qx = x.quantize_linear(/*scale=*/1.0, zero_point, kQUInt8); 
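The QuantDequantAPIs and RoundingMode checks in this file all exercise the same per-element affine mapping behind quantize_val and quantize_linear. Below is a minimal standalone sketch of that mapping, assuming round-half-to-even tie-breaking and the quint8 range; it illustrates the scheme the tests check, and is not the library's quantize_val implementation (quantize_val_sketch is an invented name).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Illustrative affine quantization: q = clamp(zero_point + round(value / scale), 0, 255).
// std::nearbyint rounds half-to-even under the default rounding mode, which is the
// tie-breaking behavior the RoundingMode test verifies.
uint8_t quantize_val_sketch(float scale, int32_t zero_point, float value) {
  const int32_t q = zero_point + static_cast<int32_t>(std::nearbyint(value / scale));
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}

int main() {
  // With scale = 1.0 and zero_point = 2 (the QuantDequantAPIs setup), 1.0f maps to 3.
  std::cout << static_cast<int>(quantize_val_sketch(1.0f, 2, 1.0f)) << "\n";  // prints 3
  // Ties round to even: 0.5 -> 0 and 1.5 -> 2 (with a zero point of 0).
  std::cout << static_cast<int>(quantize_val_sketch(1.0f, 0, 0.5f)) << "\n";  // prints 0
  std::cout << static_cast<int>(quantize_val_sketch(1.0f, 0, 1.5f)) << "\n";  // prints 2
  return 0;
}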
- auto qx_data = qx.data(); + auto qx_data = qx.data(); for (int idx = 0; idx < x_values.size(); ++idx) { ASSERT_EQ(qx_expect[idx], qx_data[idx].val_) << "Tie breaking during rounding element " << idx << " failed!"; @@ -73,7 +73,7 @@ TEST(TestQTensor, Item) { Tensor r = at::ones({1}); const float scale = 1; const int32_t zero_point = 2; - Tensor qr = r.quantize_linear(scale, zero_point); + Tensor qr = r.quantize_linear(scale, zero_point, kQUInt8); ASSERT_EQ(r.item().to(), qr.item().to()); } @@ -82,9 +82,9 @@ TEST(TestQTensor, EmptyQuantized) { int zero_point = 10; int val = 100; int numel = 10; - Tensor q = at::_empty_affine_quantized({numel}, at::device(at::kCPU).dtype(kQInt8), scale, zero_point); + Tensor q = at::_empty_affine_quantized({numel}, at::device(at::kCPU).dtype(kQUInt8), scale, zero_point); // Assigning to QTensor - auto* q_data = q.data(); + auto* q_data = q.data(); for (int i = 0; i < numel; ++i) { q_data[i].val_ = val; } diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 1c0d8576e32d..6f9ae19485e4 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include @@ -11,8 +11,8 @@ void test(int given_num_threads) { at::init_num_threads(); auto t = at::ones({1000 * 1000}, at::CPU(at::kFloat)); - ASSERT(given_num_threads >= 0); - ASSERT(at::get_num_threads() == given_num_threads); + ASSERT_TRUE(given_num_threads >= 0); + ASSERT_EQ(at::get_num_threads(), given_num_threads); auto t_sum = t.sum(); for (int i = 0; i < 1000; ++i) { t_sum = t_sum + t.sum(); @@ -38,5 +38,11 @@ int main() { at::set_num_threads(5); test(at::get_num_threads()); + // test inter-op settings + ASSERT_EQ(at::get_num_interop_threads(), std::thread::hardware_concurrency()); + at::set_num_interop_threads(5); + ASSERT_EQ(at::get_num_interop_threads(), 5); + ASSERT_ANY_THROW(at::set_num_interop_threads(6)); + return 0; } diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp index b73ea191795d..595462a2b9ba 100644 --- a/aten/src/TH/THAllocator.cpp +++ b/aten/src/TH/THAllocator.cpp @@ -89,10 +89,8 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, hfilesz.QuadPart = size; if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { - handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); event_ = CreateEvent(nullptr, FALSE, FALSE, eventname); } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { - handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); event_ = OpenEvent(EVENT_ALL_ACCESS, FALSE, eventname); } else { AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); @@ -102,6 +100,14 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, AT_ERROR("Couldn't open shared event: <", eventname, ">, error code: <", GetLastError(), ">"); } + if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { + handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); + } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { + handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); + } else { + AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); + } + if (handle_ == nullptr) { AT_ERROR("Couldn't open shared file mapping: <", filename, ">, error code: <", GetLastError(), ">"); } diff --git a/aten/src/TH/THTensor.cpp 
b/aten/src/TH/THTensor.cpp index 06b02d8dd5bd..0c5cebd223e1 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -79,7 +79,7 @@ void THTensor_resize(THTensor *self, at::IntArrayRef size, at::IntArrayRef strid void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, const int64_t *stride) { - AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); + TORCH_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); at::IntArrayRef sizes(size, nDimension); at::optional strides; if (stride) { @@ -167,7 +167,7 @@ void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { // We used to allow this, but this breaks device caching, // see Note [We regret making Variable hold a Tensor] // Let's put an actual error message for this one. - AT_CHECK(tensor->storage().device() == storage->device(), + TORCH_CHECK(tensor->storage().device() == storage->device(), "Attempted to set the storage of a tensor on device \"", tensor->storage().device(), "\" to a storage on different device \"", storage->device(), "\". This is no longer allowed; the devices must match."); diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h index c73415dc0816..61c05160b190 100644 --- a/aten/src/TH/THTensor.h +++ b/aten/src/TH/THTensor.h @@ -31,6 +31,9 @@ #include #include +#include +#include + /* fill and zero*/ #include #include diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 1ebaf9094a67..a9c89f222189 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -34,7 +34,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { // for the first time (providing the necessary type). It is an ERROR to // invoke any PyTorch operations on such a half-constructed storage, // and this check tests for that case. - AT_CHECK(tensor->storage(), "Cannot use PyTorch operations on a half-constructed " + TORCH_CHECK(tensor->storage(), "Cannot use PyTorch operations on a half-constructed " "tensor. 
If this tensor came from Caffe2, please call GetMutableData on " "it first; otherwise, this is a bug, please report it."); return tensor->storage().unsafeGetStorageImpl(); diff --git a/aten/src/TH/THTensorEvenMoreMath.cpp b/aten/src/TH/THTensorEvenMoreMath.cpp index a0b9e190998d..432deb26828d 100644 --- a/aten/src/TH/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/THTensorEvenMoreMath.cpp @@ -8,3 +8,6 @@ #include #include + +#include +#include diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp index 1c81ed291dad..23a2b3f8b6f4 100644 --- a/aten/src/TH/generic/THLapack.cpp +++ b/aten/src/TH/generic/THLapack.cpp @@ -11,12 +11,8 @@ TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, do TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); TH_EXTERNC void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info); TH_EXTERNC void sgesdd_(char *jobz, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *iwork, int *info); -TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); -TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); TH_EXTERNC void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); -TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); -TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info); TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info); TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); @@ -89,20 +85,6 @@ void THLapack_(gesdd)(char jobz, int m, int n, scalar_t *a, int lda, scalar_t *s #endif } -/* LU decomposition */ -void THLapack_(getrf)(int m, int n, scalar_t *a, int lda, int *ipiv, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgetrf_(&m, &n, a, &lda, ipiv, info); -#else - sgetrf_(&m, &n, a, &lda, ipiv, info); -#endif -#else - THError("getrf : Lapack library not found in compile time\n"); -#endif -} - void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info) { #ifdef USE_LAPACK @@ -116,20 +98,6 @@ void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ip #endif } -/* Matrix Inverse */ -void THLapack_(getri)(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int* info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgetri_(&n, a, &lda, ipiv, work, &lwork, info); -#else - sgetri_(&n, a, &lda, ipiv, work, &lwork, info); -#endif -#else - THError("getri : Lapack library not found in compile time\n"); -#endif -} - /* Cholesky factorization based Matrix Inverse */ void THLapack_(potri)(char uplo, int n, scalar_t *a, int lda, int *info) { diff --git a/aten/src/TH/generic/THLapack.h b/aten/src/TH/generic/THLapack.h index 055783464d4f..20d469d1eb6e 100644 --- a/aten/src/TH/generic/THLapack.h +++ b/aten/src/TH/generic/THLapack.h @@ -10,11 +10,7 @@ 
TH_API void THLapack_(syev)(char jobz, char uplo, int n, scalar_t *a, int lda, s TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info); /* svd */ TH_API void THLapack_(gesdd)(char jobz, int m, int n, scalar_t *a, int lda, scalar_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *iwork, int *info); -/* LU decomposition */ -TH_API void THLapack_(getrf)(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); TH_API void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); -/* Matrix Inverse */ -TH_API void THLapack_(getri)(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int* info); /* Positive Definite matrices */ /* Matrix inverse based on Cholesky factorization */ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 2c28efcaf3f1..fcb7e8f4e2b9 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -79,7 +79,7 @@ THTensor *THTensor_(newWithTensor)(THTensor *tensor) /* Storage init */ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, at::IntArrayRef sizes, at::IntArrayRef strides) { if (strides.data()) { - AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + TORCH_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } THTensor *self = c10::make_intrusive( c10::intrusive_ptr::reclaim(THStorage_(new)()), @@ -154,7 +154,7 @@ THTensor *THTensor_(newClone)(THTensor *self) THTensor_(resizeAs)(tensor, self); at::Tensor tensor_wrap = THTensor_wrap(tensor); at::Tensor self_wrap = THTensor_wrap(self); - at::_copy_same_type_(tensor_wrap, self_wrap); + at::native::copy_(tensor_wrap, self_wrap, false); return tensor; } @@ -596,7 +596,7 @@ void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) if(self != dst) { at::Tensor dst_wrap = THTensor_wrap(dst); at::Tensor self_wrap = THTensor_wrap(self); - at::_copy_same_type_(dst_wrap, self_wrap); + at::native::copy_(dst_wrap, self_wrap, false); } THTensor_(free)(self); @@ -816,7 +816,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int THTensor_(narrow)(nt, NULL, dimension, offset, dimSize); at::Tensor nt__wrap = THTensor_wrap(nt); at::Tensor inputs_wrap = THTensor_wrap(inputs[j]); - at::_copy_same_type_(nt__wrap, inputs_wrap); + at::native::copy_(nt__wrap, inputs_wrap, false); c10::raw::intrusive_ptr::decref(nt); offset += dimSize; } diff --git a/aten/src/TH/generic/THTensorApply.hpp b/aten/src/TH/generic/THTensorApply.hpp index 7d9b6bc9a0ab..a7994c6bbad1 100644 --- a/aten/src/TH/generic/THTensorApply.hpp +++ b/aten/src/TH/generic/THTensorApply.hpp @@ -61,11 +61,20 @@ if (std::isnan(val)) break; #define th_isnan_break(val) #endif -#ifdef _WIN32 -// MSVC doesn't support loop pragmas, but does support others. Create a new macro to account for those differences. -#define PRAGMA_LOOP(P) // Noop +#ifdef _MSC_VER +#define PRAGMA(P) __pragma(P) +# if _MSC_VER < 1920 +// MSVC < 2019 doesn't support loop pragmas. 
+# define PRAGMA_IVDEP // Noop +# define PRAGMA_SIMD // Noop +# else +# define PRAGMA_IVDEP PRAGMA(loop(ivdep)) +# define PRAGMA_SIMD PRAGMA(omp simd) +# endif #else -#define PRAGMA_LOOP(P) _Pragma(#P) +#define PRAGMA(P) _Pragma(#P) +#define PRAGMA_IVDEP PRAGMA(ivdep) +#define PRAGMA_SIMD PRAGMA(simd) #endif #define TH_TENSOR_APPLY2_PARALLEL(SIZE, CONTIG1, CONTIG2, TYPE1, TENSOR1, TYPE2, TENSOR2, CODE, THRESHOLD) \ @@ -76,7 +85,7 @@ if (std::isnan(val)) break; TYPE2 *tp = THTensor_getStoragePtr(TENSOR2)->data()+TENSOR2->storage_offset(); \ if (tp != (TYPE2*)rp) { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(ivdep) \ + PRAGMA_IVDEP \ for (auto iter = begin; iter < end; iter++) { \ TYPE2 *TENSOR2##_data = tp+iter; \ TYPE1 *TENSOR1##_data = rp+iter; \ @@ -85,7 +94,7 @@ if (std::isnan(val)) break; }); \ } else { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(simd) \ + PRAGMA_SIMD \ for (auto iter = begin; iter < end; iter++) { \ TYPE2* TENSOR2##_data = tp+iter; \ TYPE1* TENSOR1##_data = rp+iter; \ @@ -165,7 +174,7 @@ if (std::isnan(val)) break; TYPE3 *srcp = THTensor_getStoragePtr(TENSOR3)->data()+TENSOR3->storage_offset(); \ if (tp != (TYPE2*)rp) { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(ivdep) \ + PRAGMA_IVDEP \ for (auto iter = begin; iter < end; iter++) { \ TYPE1 *TENSOR1##_data = rp+iter; \ TYPE2 *TENSOR2##_data = tp+iter; \ @@ -175,7 +184,7 @@ if (std::isnan(val)) break; }); \ } else { \ at::parallel_for(0, SIZE, (THRESHOLD * 10), [&](int64_t begin, int64_t end) { \ - PRAGMA_LOOP(simd) \ + PRAGMA_SIMD \ for (auto iter = begin; iter < end; iter++) { \ TYPE1 *TENSOR1##_data = rp+iter; \ TYPE2 *TENSOR2##_data = tp+iter; \ diff --git a/aten/src/TH/generic/THTensorConv.cpp b/aten/src/TH/generic/THTensorConv.cpp index 375042787a38..1fbbc6e491ae 100644 --- a/aten/src/TH/generic/THTensorConv.cpp +++ b/aten/src/TH/generic/THTensorConv.cpp @@ -591,8 +591,8 @@ void THTensor_(conv2DRevger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTens scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -696,8 +696,8 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, scalar_t beta, scalar_t alpha, THTen scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -807,8 +807,8 @@ void THTensor_(conv2Dger)(THTensor *r_, scalar_t beta, scalar_t 
alpha, THTensor scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -941,8 +941,8 @@ void THTensor_(conv2Dmv)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor * scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -1082,8 +1082,8 @@ void THTensor_(conv2Dmm)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor * scalar_t *weight_data; scalar_t *output_data; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -1232,8 +1232,8 @@ void THTensor_(conv2Dmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 2, "input: non-empty 2D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 2, "kernel: non-empty 2D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -1291,8 +1291,8 @@ void THTensor_(conv2Dcmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && 
k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); @@ -1369,8 +1369,8 @@ void THTensor_(conv2Dmap)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(THTensor_nDimensionLegacyAll(map) == 2 , 4, "map: 2D Tensor expected"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1457,8 +1457,8 @@ void THTensor_(conv3DRevger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTens ptrdiff_t nelem; int64_t k, i; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1543,8 +1543,8 @@ void THTensor_(conv3Dger)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k, i; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1634,8 +1634,8 @@ void THTensor_(conv3Dmv)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor * ptrdiff_t nelem; int64_t k, i; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 5, "kernel: non-empty 5D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1729,8 +1729,8 @@ void THTensor_(conv3Dmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor scalar_t *output_data; ptrdiff_t nelem; - AT_CHECK(!t_->is_empty() && t_->dim() == 3, "input: 
non-empty 3D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 3, "input: non-empty 3D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 3, "kernel: non-empty 3D Tensor expected, got size: ", k_->sizes()); THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); @@ -1796,8 +1796,8 @@ void THTensor_(conv3Dcmul)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor ptrdiff_t nelem; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); @@ -1882,8 +1882,8 @@ void THTensor_(conv3Dmap)(THTensor *r_, scalar_t beta, scalar_t alpha, THTensor int64_t nmaps; int64_t k; - AT_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); - AT_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); + TORCH_CHECK(!t_->is_empty() && t_->dim() == 4, "input: non-empty 4D Tensor expected, got size: ", t_->sizes()); + TORCH_CHECK(!k_->is_empty() && k_->dim() == 4, "kernel: non-empty 4D Tensor expected, got size: ", k_->sizes()); THArgCheck(THTensor_nDimensionLegacyAll(map) == 2 , 4, "map: 2D Tensor expected"); THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 00da33304a1a..8596f8d38df2 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -11,7 +11,7 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) int64_t *subscript_data; int64_t i = 0; #ifdef TH_REAL_IS_HALF -#define IS_NONZERO(val) ((val.x & 0x7fff) != 0) +#define IS_NONZERO(val) (c10::Half(0)!=val) #else #define IS_NONZERO(val) ((val)!=0) #endif @@ -65,8 +65,12 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) ); delete [] sizes; delete [] idx; + +#undef IS_NONZERO } +#if !defined(TH_REAL_IS_HALF) /* non half only part */ + accreal THTensor_(sumall)(THTensor *tensor) { accreal sum = 0; @@ -75,7 +79,76 @@ accreal THTensor_(sumall)(THTensor *tensor) return sum; } -#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ +void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) +{ + ptrdiff_t numel = THByteTensor_sumall(mask); + scalar_t *tensor_data; + +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THTensor_(resize1d)(tensor,numel); + tensor_data = tensor->data(); + TH_TENSOR_APPLY2(scalar_t, src, unsigned char, mask, + if (*mask_data > 1) + { + THFree(mask_counter); + THFree(src_counter); + THError("Mask tensor can 
take 0 and 1 values only"); + } + else if (*mask_data == 1) + { + *tensor_data = *src_data; + tensor_data++; + }); +} + +void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor *src, THBoolTensor *mask) +{ + ptrdiff_t numel = THBoolTensor_sumall(mask); + scalar_t *tensor_data; + +#ifdef DEBUG + THAssert(numel <= LONG_MAX); +#endif + THTensor_(resize1d)(tensor,numel); + tensor_data = tensor->data(); + TH_TENSOR_APPLY2(scalar_t, src, bool, mask, + if (*mask_data) + { + *tensor_data = *src_data; + tensor_data++; + }); +} + +void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)value; + return THError("bitand is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + if (r_Contig && tContig) { + scalar_t *tp = t->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] & value; + } + }); + } else { + TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data & value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } +#endif +} + +#if !defined(TH_REAL_IS_BOOL) void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, scalar_t value) { @@ -186,48 +259,6 @@ void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTensor* s c10::raw::intrusive_ptr::decref(srct); } -void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) -{ - ptrdiff_t numel = THByteTensor_sumall(mask); - scalar_t *tensor_data; - -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - THTensor_(resize1d)(tensor,numel); - tensor_data = tensor->data(); - TH_TENSOR_APPLY2(scalar_t, src, unsigned char, mask, - if (*mask_data > 1) - { - THFree(mask_counter); - THFree(src_counter); - THError("Mask tensor can take 0 and 1 values only"); - } - else if (*mask_data == 1) - { - *tensor_data = *src_data; - tensor_data++; - }); -} - -void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor *src, THBoolTensor *mask) -{ - ptrdiff_t numel = THBoolTensor_sumall(mask); - scalar_t *tensor_data; - -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - THTensor_(resize1d)(tensor,numel); - tensor_data = tensor->data(); - TH_TENSOR_APPLY2(scalar_t, src, bool, mask, - if (*mask_data) - { - *tensor_data = *src_data; - tensor_data++; - }); -} - void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) { ptrdiff_t i, numel; @@ -304,7 +335,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens THTensor_(select)(sSlice, src, dim, index_data[i]); at::Tensor tSlice_wrap = THTensor_wrap(tSlice); at::Tensor sSlice_wrap = THTensor_wrap(sSlice); - at::_copy_same_type_(tSlice_wrap, sSlice_wrap); + at::native::copy_(tSlice_wrap, sSlice_wrap); c10::raw::intrusive_ptr::decref(tSlice); c10::raw::intrusive_ptr::decref(sSlice); } @@ -337,7 +368,7 @@ void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTens THTensor_(select)(sSlice, src, dim, i); at::Tensor tSlice_wrap = THTensor_wrap(tSlice); at::Tensor sSlice_wrap = THTensor_wrap(sSlice); - at::_copy_same_type_(tSlice_wrap, sSlice_wrap); + at::native::copy_(tSlice_wrap, sSlice_wrap); } c10::raw::intrusive_ptr::decref(tSlice); @@ 
-877,32 +908,7 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, scalar_t value) } } -void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)value; - return THError("bitand is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - if (r_Contig && tContig) { - scalar_t *tp = t->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] & value; - } - }); - } else { - TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data & value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } #endif -} #endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index e0ed46a8b241..6f53e9ad5ef6 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -83,14 +83,14 @@ static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, if (src->size(0) == nrows) { at::Tensor result_wrap = THTensor_wrap(result); at::Tensor src_wrap = THTensor_wrap(src); - at::_copy_same_type_(result_wrap, src_wrap); + at::native::copy_(result_wrap, src_wrap); } else { view = THTensor_(newNarrow)(result, 0, 0, src->size(0)); at::Tensor view_wrap = THTensor_wrap(view); at::Tensor src_wrap = THTensor_wrap(src); - at::_copy_same_type_(view_wrap, src_wrap); + at::native::copy_(view_wrap, src_wrap); c10::raw::intrusive_ptr::decref(view); } return result; @@ -118,7 +118,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " "dimensions, but has %d", b->dim()); THArgCheck(!b->is_empty(), 1, "B should not be empty"); - AT_CHECK(a->size(0) == b->size(0), "Expected A and b to have same size " + TORCH_CHECK(a->size(0) == b->size(0), "Expected A and b to have same size " "at dim 0, but A has ", a->size(0), " rows and B has ", b->size(0), " rows"); if (THTensor_nDimensionLegacyAll(b) == 1) { @@ -432,7 +432,7 @@ void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra THTensor_(resizeAs)(rv_, rvf_); at::Tensor rv__wrap = THTensor_wrap(rv_); at::Tensor rvf__wrap = THTensor_wrap(rvf_); - at::_copy_same_type_(rv__wrap, rvf__wrap); + at::native::copy_(rv__wrap, rvf__wrap); c10::raw::intrusive_ptr::decref(rvf_); } else { THTensor_(zero)(ru_); @@ -440,50 +440,6 @@ void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra } } -void THTensor_(getri)(THTensor *ra_, THTensor *a) -{ - if (a == NULL) a = ra_; - THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - - int m, n, lda, info, lwork; - scalar_t wkopt; - THIntTensor *ipiv; - THTensor *work; - THTensor *ra__ = NULL; - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - m = ra__->size(0); - n = ra__->size(1); - lda = m; - ipiv = THIntTensor_newWithSize1d((int64_t)m); - - /* Run LU */ - THLapack_(getrf)(n, n, ra__->data(), lda, THIntTensor_data(ipiv), &info); - THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", - THCleanup( - c10::raw::intrusive_ptr::decref(ra__); - 
THIntTensor_free(ipiv);), - "getrf", info, info); - - /* Run inverse */ - THLapack_(getri)(n, ra__->data(), lda, THIntTensor_data(ipiv), &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(getri)(n, ra__->data(), lda, THIntTensor_data(ipiv), work->data(), lwork, &info); - THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", - THCleanup( - c10::raw::intrusive_ptr::decref(ra__); - c10::raw::intrusive_ptr::decref(work); - THIntTensor_free(ipiv);), - "getri", info, info); - - THTensor_(freeCopyTo)(ra__, ra_); - c10::raw::intrusive_ptr::decref(work); - THIntTensor_free(ipiv); -} - void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) { THArgCheck(THTensor_nDimensionLegacyAll(a) == 2, 1, "A should be 2 dimensional"); @@ -831,9 +787,9 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots) { - AT_CHECK(!atf->is_empty() && THTensor_(nDimensionLegacyNoScalars)(atf) == 3, "expected non-empty 3D tensor, got size: ", + TORCH_CHECK(!atf->is_empty() && THTensor_(nDimensionLegacyNoScalars)(atf) == 3, "expected non-empty 3D tensor, got size: ", atf->sizes()); - AT_CHECK(!b->is_empty() && (THTensor_(nDimensionLegacyNoScalars)(b) == 3 || + TORCH_CHECK(!b->is_empty() && (THTensor_(nDimensionLegacyNoScalars)(b) == 3 || THTensor_(nDimensionLegacyNoScalars)(b) == 2), "expected non-empty 2D or 3D tensor, got size: ", b->sizes()); THArgCheck(THTensor_(size)(atf, 0) == THTensor_(size)(b, 0), 3, "number of batches must be equal"); @@ -846,7 +802,7 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor THTensor_(resizeAs)(rb_, b); at::Tensor rb__wrap = THTensor_wrap(rb_); at::Tensor b_wrap = THTensor_wrap(b); - at::_copy_same_type_(rb__wrap, b_wrap); + at::native::copy_(rb__wrap, b_wrap); } int64_t num_batches = atf->size(0); diff --git a/aten/src/TH/generic/THTensorLapack.h b/aten/src/TH/generic/THTensorLapack.h index 4c693a870a86..5c512ab98110 100644 --- a/aten/src/TH/generic/THTensorLapack.h +++ b/aten/src/TH/generic/THTensorLapack.h @@ -8,7 +8,6 @@ TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const ch TH_API void THTensor_(gesdd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *some, const char* compute_uv); TH_API void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char *some, const char* compute_uv); -TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a); TH_API void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo); TH_API void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a); TH_API void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a); diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 4dbc8fbdaecb..52718e541d57 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -21,20 +21,115 @@ // sense (rather than just having cut the file down the middle, which is // what I did when I split these up originally). 
-#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ +void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)src; + return THError("cbitand is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + scalar_t *tp = t->data(); + scalar_t *sp = src->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] & sp[i]; + } + }); + } else { + TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } + } else { + TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;); + } +#endif +} -// Should wrap if the value (a) has a different sign than the divisor (b), but is not 0. -static inline bool modulo_wrap(scalar_t a, scalar_t b) { - return (a != 0) && (a < 0) != (b < 0); +void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)src; + return THError("cbitor is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + scalar_t *tp = t->data(); + scalar_t *sp = src->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] | sp[i]; + } + }); + } else { + TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } + } else { + TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;); + } +#endif } -void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value) +void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src) +{ +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + (void)r_; + (void)t; + (void)src; + return THError("cbitxor is only supported for integer type tensors"); +#else + THTensor_(resizeAs)(r_, t); + int64_t r_Size = THTensor_(nElement)(r_); + int64_t srcSize = THTensor_(nElement)(src); + int r_Contig = THTensor_(isContiguous)(r_); + int tContig = THTensor_(isContiguous)(t); + int srcContig = THTensor_(isContiguous)(src); + if (srcSize == r_Size){ + if (r_Contig && tContig && srcContig) { + scalar_t *tp = t->data(); + scalar_t *sp = src->data(); + scalar_t *rp = r_->data(); + at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + rp[i] = tp[i] ^ sp[i]; + } + }); + } else { + TH_TENSOR_APPLY3_PARALLEL(r_Size, 
r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + } + } else { + TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;); + } +#endif +} + +void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value) { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) (void)r_; (void)t; (void)value; - return THError("bitor is only supported for integer type tensors"); + return THError("bitxor is only supported for integer type tensors"); #else THTensor_(resizeAs)(r_, t); int64_t r_Size = THTensor_(nElement)(r_); @@ -46,22 +141,22 @@ void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value) at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, [&](int64_t start, int64_t end) { for (auto i = start; i < end; i++) { - rp[i] = tp[i] | value; + rp[i] = tp[i] ^ value; } }); } else { - TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data | value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data ^ value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); } #endif } -void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value) +void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value) { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) (void)r_; (void)t; (void)value; - return THError("bitxor is only supported for integer type tensors"); + return THError("bitor is only supported for integer type tensors"); #else THTensor_(resizeAs)(r_, t); int64_t r_Size = THTensor_(nElement)(r_); @@ -73,15 +168,22 @@ void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value) at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD * 100, [&](int64_t start, int64_t end) { for (auto i = start; i < end; i++) { - rp[i] = tp[i] ^ value; + rp[i] = tp[i] | value; } }); } else { - TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data ^ value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); + TH_TENSOR_APPLY2_PARALLEL(r_Size, r_Contig, tContig, scalar_t, r_, scalar_t, t, *r__data = *t_data | value;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); } #endif } +#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ + +// Should wrap if the value (a) has a different sign than the divisor (b), but is not 0. 
+static inline bool modulo_wrap(scalar_t a, scalar_t b) { + return (a != 0) && (a < 0) != (b < 0); +} + void THTensor_(clamp)(THTensor *r_, THTensor *t, scalar_t min_value, scalar_t max_value) { THTensor_(resizeAs)(r_, t); @@ -176,7 +278,7 @@ void THTensor_(pow)(THTensor *r_, THTensor *t, scalar_t value) if(value == 1) { at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } else if(value == 2){ THTensor_(cmul)(r_, t, t); @@ -453,108 +555,6 @@ void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src) } } -void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)src; - return THError("cbitand is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int64_t srcSize = THTensor_(nElement)(src); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - int srcContig = THTensor_(isContiguous)(src); - if (srcSize == r_Size){ - if (r_Contig && tContig && srcContig) { - scalar_t *tp = t->data(); - scalar_t *sp = src->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] & sp[i]; - } - }); - } else { - TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } - } else { - TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data & *src_data;); - } -#endif -} - -void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)src; - return THError("cbitor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int64_t srcSize = THTensor_(nElement)(src); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - int srcContig = THTensor_(isContiguous)(src); - if (srcSize == r_Size){ - if (r_Contig && tContig && srcContig) { - scalar_t *tp = t->data(); - scalar_t *sp = src->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] | sp[i]; - } - }); - } else { - TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } - } else { - TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data | *src_data;); - } -#endif -} - -void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - (void)r_; - (void)t; - (void)src; - return THError("cbitxor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - int64_t r_Size = THTensor_(nElement)(r_); - int64_t srcSize = THTensor_(nElement)(src); - int r_Contig = THTensor_(isContiguous)(r_); - int tContig = THTensor_(isContiguous)(t); - int srcContig = THTensor_(isContiguous)(src); - if (srcSize == r_Size){ - if (r_Contig && tContig && 
srcContig) { - scalar_t *tp = t->data(); - scalar_t *sp = src->data(); - scalar_t *rp = r_->data(); - at::parallel_for(0, r_Size, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - rp[i] = tp[i] ^ sp[i]; - } - }); - } else { - TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, tContig, srcContig, scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD); - } - } else { - TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, t, scalar_t, src, *r__data = *t_data ^ *src_data;); - } -#endif -} - void THTensor_(tpow)(THTensor *r_, scalar_t value, THTensor *t) { THTensor_(resizeAs)(r_, t); @@ -582,7 +582,7 @@ void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } int64_t r_Size = THTensor_(nElement)(r_); int64_t src1Size = THTensor_(nElement)(src1); @@ -604,7 +604,7 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } int64_t r_Size = THTensor_(nElement)(r_); int64_t src1Size = THTensor_(nElement)(src1); @@ -645,7 +645,7 @@ void THTensor_(addmv)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } auto r_stride = THTensor_strideLegacyNoScalars(r_, 0); @@ -768,7 +768,7 @@ void THTensor_(addmm)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, if (beta != 0.0) { at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } } @@ -905,7 +905,7 @@ void THTensor_(addr)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, T THTensor_(resizeAs)(r_, t); at::Tensor r__wrap = THTensor_wrap(r_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(r__wrap, t_wrap); + at::native::copy_(r__wrap, t_wrap); } if(beta == 0) { @@ -970,7 +970,7 @@ void THTensor_(addbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t al if (beta != 0.0) { at::Tensor result_wrap = THTensor_wrap(result); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(result_wrap, t_wrap); + at::native::copy_(result_wrap, t_wrap); } } diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h index 82823441aee8..7ec9d7854ba6 100644 --- a/aten/src/TH/generic/THTensorMath.h +++ b/aten/src/TH/generic/THTensorMath.h @@ -4,6 +4,8 @@ TH_API void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor); +#ifndef TH_REAL_IS_HALF + TH_API void THTensor_(ltValue)(THByteTensor *r_, THTensor* t, scalar_t value); TH_API void THTensor_(leValue)(THByteTensor *r_, THTensor* t, scalar_t value); TH_API void THTensor_(gtValue)(THByteTensor *r_, THTensor* t, scalar_t value); @@ -35,14 +37,24 @@ TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); TH_API accreal THTensor_(sumall)(THTensor *t); TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb); +TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); +TH_API void 
THTensor_(maskedSelectBool)(THTensor *tensor, THTensor* src, THBoolTensor *mask); + +TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value); +TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value); +TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); +TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value); +TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); + +TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); + #if !defined(TH_REAL_IS_BOOL) /* non bool only part */ TH_API void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, scalar_t value); TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src); -TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); TH_API void THTensor_(maskedFillBool)(THTensor *tensor, THBoolTensor *mask, scalar_t value); TH_API void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTensor* src); -TH_API void THTensor_(maskedSelectBool)(THTensor *tensor, THTensor* src, THBoolTensor *mask); TH_API void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); TH_API void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); @@ -75,9 +87,6 @@ TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, scalar_t min_value, scalar_t max_value); -TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, scalar_t value); -TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, scalar_t value); -TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, scalar_t value); TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src); TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, scalar_t value, THTensor *src2); @@ -88,9 +97,6 @@ TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2); TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2); @@ -113,7 +119,6 @@ TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim); TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension); TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension); -TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); TH_API accreal THTensor_(trace)(THTensor *t); TH_API void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src); @@ -183,3 +188,4 @@ TH_API void THTensor_(dirichlet_grad)(THTensor *self, THTensor *x, THTensor *alp #endif #endif +#endif diff --git 
a/aten/src/TH/generic/THTensorMoreMath.cpp b/aten/src/TH/generic/THTensorMoreMath.cpp index 1fcb8ee231df..e22ccdcea45e 100644 --- a/aten/src/TH/generic/THTensorMoreMath.cpp +++ b/aten/src/TH/generic/THTensorMoreMath.cpp @@ -63,6 +63,26 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) return equal; } +void THTensor_(sign)(THTensor *r_, THTensor *t) +{ + THTensor_(resizeAs)(r_, t); + +#if defined (TH_REAL_IS_BYTE) + TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, + if (*t_data > 0) *r__data = 1; + else *r__data = 0;); +#elif defined (TH_REAL_IS_BOOL) +TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, + if (*t_data == true) *r__data = false; + else *r__data = true;); +#else + TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, + if (*t_data > 0) *r__data = 1; + else if (*t_data < 0) *r__data = -1; + else *r__data = 0;); +#endif +} + #if !defined(TH_REAL_IS_BOOL) /* non bool only part */ void THTensor_(baddbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *batch1, THTensor *batch2) @@ -91,7 +111,7 @@ void THTensor_(baddbmm)(THTensor *result, scalar_t beta, THTensor *t, scalar_t a if (beta != 0.0) { at::Tensor result_wrap = THTensor_wrap(result); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(result_wrap, t_wrap); + at::native::copy_(result_wrap, t_wrap); } } @@ -177,7 +197,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int at::Tensor values__wrap = THTensor_wrap(values_); at::Tensor t0_wrap = THTensor_wrap(t0); auto right_shape = t0_wrap.reshape(values__wrap.sizes()); - at::_copy_same_type_(values__wrap, right_shape); + at::native::copy_(values__wrap, right_shape); c10::raw::intrusive_ptr::decref(t0); } else { THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); @@ -261,7 +281,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int at::Tensor values__wrap = THTensor_wrap(values_); at::Tensor t0_wrap = THTensor_wrap(t0); auto right_shape = t0_wrap.reshape(values__wrap.sizes()); - at::_copy_same_type_(values__wrap, right_shape); + at::native::copy_(values__wrap, right_shape); c10::raw::intrusive_ptr::decref(t0); } else { THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); @@ -400,24 +420,6 @@ void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) }); } - -void THTensor_(sign)(THTensor *r_, THTensor *t) -{ - THTensor_(resizeAs)(r_, t); - -#if defined (TH_REAL_IS_BYTE) - TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, - if (*t_data > 0) *r__data = 1; - else *r__data = 0;); -#else - TH_TENSOR_APPLY2(scalar_t, r_, scalar_t, t, - if (*t_data > 0) *r__data = 1; - else if (*t_data < 0) *r__data = -1; - else *r__data = 0;); -#endif -} - - accreal THTensor_(trace)(THTensor *t) { scalar_t *t_data = t->data(); @@ -737,7 +739,7 @@ void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimensio THTensor_(resizeAs)(rt_, t); at::Tensor rt__wrap = THTensor_wrap(rt_); at::Tensor t_wrap = THTensor_wrap(t); - at::_copy_same_type_(rt__wrap, t_wrap); + at::native::copy_(rt__wrap, t_wrap); THLongTensor_resize(ri_, t->sizes(), {}); if(descendingOrder) @@ -1341,7 +1343,7 @@ void THTensor_(renorm)(THTensor *res, THTensor *src, scalar_t value, int dimensi { at::Tensor rowR_wrap = THTensor_wrap(rowR); at::Tensor rowS_wrap = THTensor_wrap(rowS); - at::_copy_same_type_(rowR_wrap, rowS_wrap); + at::native::copy_(rowR_wrap, rowS_wrap); } } diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 4a7dcccfa93d..28c44dcfeaaa 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ 
-18,7 +18,7 @@ foreach(THC_TYPE Byte Char Short Int Long Half Float Double) endforeach() endforeach() -foreach(THC_FILE TensorMathCompareT TensorMathCompare TensorMathReduce TensorMasked) +foreach(THC_FILE TensorMathCompareT TensorMathCompare TensorMathReduce TensorMasked TensorMathPointwise) if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}Bool.cu") FILE(WRITE "${CMAKE_CURRENT_SOURCE_DIR}/generated/THC${THC_FILE}Bool.cu" "#include \n#include \n\n#include \n#include \n") diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 39048730a6ee..4927bf2a6800 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -508,37 +508,6 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i } #endif -/* Inverse */ -void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Sgetrf only supports n, lda, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasSgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); -#else - THError("THCudaBlas_Sgetrf not supported in ROCM."); -#endif -} - -void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Dgetrf only supports n, lda, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasDgetrfBatched(handle, n, a, lda, pivot, info, batchSize)); -#else - THError("THCudaBlas_Dgetrf not supported in ROCM."); -#endif -} - void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize) { #ifndef __HIP_PLATFORM_HCC__ @@ -579,33 +548,3 @@ void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const doub THError("THCudaBlas_Dgetrs not supported in ROCM."); #endif } - -void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Sgetri only supports n, lda, ldc, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasSgetriBatched(handle, n, a, lda, pivot, c, ldc, info, batchSize)); -#else - THError("THCudaBlas_Sgetri not supported in ROCM."); -#endif -} - -void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) { -#ifndef __HIP_PLATFORM_HCC__ - if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) - { - THError("Cublas_Dgetri only supports n, lda, ldc, batchSize" - "with the bound [val] <= %d", INT_MAX); - } - cublasHandle_t handle = THCState_getCurrentBlasHandle(state); - cublasSetStream(handle, THCState_getCurrentStream(state)); - THCublasCheck(cublasDgetriBatched(handle, n, a, lda, pivot, c, ldc, info, 
batchSize)); -#else - THError("THCudaBlas_Dgetri not supported in ROCM."); -#endif -} diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index 030646892d7c..56e011386a39 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -42,14 +42,7 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); #endif -/* Inverse */ -THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize); -THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize); - THC_API void THCudaBlas_Sgetrs(THCState *state, char transa, int n, int nrhs, const float **a, int lda, int *pivot, float **b, int ldb, int *info, int batchSize); THC_API void THCudaBlas_Dgetrs(THCState *state, char transa, int n, int nrhs, const double **a, int lda, int *pivot, double **b, int ldb, int *info, int batchSize); -THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize); -THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize); - #endif diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 8065d6020fb3..171ec945f575 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -104,7 +104,7 @@ void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, const int64_t *size, const int64_t *stride) { - AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); + TORCH_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); at::IntArrayRef sizes(size, nDimension); at::optional strides; if (stride) { diff --git a/aten/src/THC/THCTensorMath.h b/aten/src/THC/THCTensorMath.h index 264a6b26b1e3..4002a85f7ce7 100644 --- a/aten/src/THC/THCTensorMath.h +++ b/aten/src/THC/THCTensorMath.h @@ -19,9 +19,15 @@ #include #include +#include +#include + #include #include +#include +#include + #include #include @@ -46,6 +52,9 @@ #include #include +#include +#include + #include #include diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index 63eb707cfaf3..fe781feb0d53 100644 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -303,3 +303,6 @@ struct TensorBitXorConstantOp { #include #include + +#include +#include diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index db7f6446335b..f324fa2f0b1a 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -101,24 +101,6 @@ THC_API __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_s } } -// Goes from (0, 1] to [0, 1). Note 1-x is not sufficient since for some floats -// eps near 0, 1-eps will round to 1. 
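// Illustrative sketch (hypothetical names, plain C++, not taken from the diff):
// curand_uniform returns values in (0, 1], and the removed helpers below remap
// that interval to [a, b). The core of the transform, stripped of the CUDA
// scaffolding, is:
static inline float reverse_bounds_sketch(float u) {
  // Send the lone endpoint u == 1.0f to 0.0f. Computing 1 - u instead is not
  // safe: for eps near 0, 1 - eps can round back to exactly 1.
  return u == 1.0f ? 0.0f : u;
}
static inline float uniform_from_curand_sketch(float u, float a, float b) {
  return reverse_bounds_sketch(u) * (b - a) + a;  // result lies in [a, b)
}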
-template -__device__ inline T reverse_bounds(T value) { - if (THCNumerics::eq(value, ScalarConvert::to(1))) { - return ScalarConvert::to(0); - } - return value; -} - - -__device__ inline at::Half half_uniform_scale_and_shift(float x, double a, double b) { - at::Half width = ScalarConvert::to(b - a); - at::Half start = ScalarConvert::to(a); - at::Half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); - return THCNumerics::add(scaled, start); -} - #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ { \ @@ -147,11 +129,6 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) } \ } -// NOTE: curand_uniform is (0, 1] and we want [a, b) -GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) -GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) -GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a) - GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean) GENERATE_KERNEL2(generate_normal, double, double mean, double stdv, double, curand_normal_double, (x * stdv) + mean) @@ -161,7 +138,6 @@ GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uni GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) -GENERATE_KERNEL2(generate_uniform, at::Half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. 
/ lambda * log(x))))) GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) diff --git a/aten/src/THC/generated/THCTensorMathPointwiseBool.cu b/aten/src/THC/generated/THCTensorMathPointwiseBool.cu new file mode 100644 index 000000000000..817106de7175 --- /dev/null +++ b/aten/src/THC/generated/THCTensorMathPointwiseBool.cu @@ -0,0 +1,5 @@ +#include +#include + +#include +#include diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 4844e61a8ad6..c04ec7fdcbbe 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -89,7 +89,7 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) /* Storage init */ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, at::IntArrayRef sizes, at::IntArrayRef strides) { if (strides.data()) { - AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); + TORCH_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } THCTensor *self = c10::make_intrusive( c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 3e837e45c267..19fe5a5fc408 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -6,7 +6,7 @@ void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { if (dst == src) return; at::Tensor dst_wrap = THTensor_wrap(dst); at::Tensor src_wrap = THTensor_wrap(src); - at::s_copy_(dst_wrap, src_wrap); + at::native::copy_(dst_wrap, src_wrap); } template <> @@ -16,7 +16,7 @@ THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { THCTensor_resizeAs(state, tensor, self); at::Tensor tensor_wrap = THTensor_wrap(tensor); at::Tensor self_wrap = THTensor_wrap(self); - at::s_copy_(tensor_wrap, self_wrap); + at::native::copy_(tensor_wrap, self_wrap); return tensor; } @@ -37,7 +37,7 @@ void THCTensor_freeCopyTo(THCState *state, THCTensor *self, THCTensor if(self != dst) { at::Tensor dst_wrap = THTensor_wrap(dst); at::Tensor self_wrap = THTensor_wrap(self); - at::s_copy_(dst_wrap, self_wrap); + at::native::copy_(dst_wrap, self_wrap); } THCTensor_free(state, self); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index 198495bb881a..a83d6d0da864 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -64,7 +64,7 @@ void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); - AT_CHECK(a_->size(0) == b_->size(0), "Expected A and b to have same size " + TORCH_CHECK(a_->size(0) == b_->size(0), "Expected A and b to have same size " "at dim 0, but A has ", a_->size(0), " rows and B has ", b_->size(0), " rows"); THArgCheck(a_->size(0) >= a_->size(1), 2, "Expected A with shape (m x n) to have " "m >= n. 
The case for m < n is not implemented yet."); @@ -334,112 +334,6 @@ void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTens #endif } -void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) -{ - THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); - THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - -#ifdef USE_MAGMA - int info; - int64_t n = a->size(0); - int lwork = n * magma_get_sgetri_nb(n); - - THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); - scalar_t *input_data = THCTensor_(data)(state, input); - - int *ipiv = th_magma_malloc_pinned(n); - - THCTensor *work = THCTensor_(newWithSize1d)(state, lwork); - scalar_t *work_data = THCTensor_(data)(state, work); - - // Run LU -#if defined(THC_REAL_IS_FLOAT) - magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info); -#else - magma_dgetrf_gpu(n, n, input_data, n, ipiv, &info); -#endif - - if (info > 0) - THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("MAGMA getrf : Argument %d : illegal value", -info); - - // Inverse -#if defined(THC_REAL_IS_FLOAT) - magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); -#else - magma_dgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info); -#endif - - if (info > 0) - THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("MAGMA getri : Argument %d : illegal value", -info); - - THCTensor_(free)(state, work); - magma_free_pinned(ipiv); - THCTensor_(freeCopyTo)(state, input, ra_); -#else - int64_t n = a->size(0); - - // input - THCTensor *input = THCTensor_(newColumnMajor)(state, a, a); - THCTensor_(resizeNd)(state, ra_, 2, THTensor_getSizePtr(input), THTensor_getStridePtr(input)); - - scalar_t *matrices1[1] = { THCTensor_(data)(state, input) }; - scalar_t *matrices2[1] = { THCTensor_(data)(state, ra_) }; - - // Copy pointers to device. 
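// Note on the removed fallback path that follows: the batched cuBLAS routines
// (cublas*getrfBatched / cublas*getriBatched) expect a device-resident array of
// matrix pointers, so the single host-side pointers are staged into
// d_matrices1/d_matrices2 with cudaMemcpyAsync before running the LU
// factorization (getrf) and then the inverse from the LU factors (getri),
// each with batchSize == 1.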
- auto d_matrices1 = static_cast(THCudaMalloc(state, sizeof(scalar_t*))); - auto d_matrices2 = static_cast(THCudaMalloc(state, sizeof(scalar_t*))); - - THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, sizeof(scalar_t*), - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, sizeof(scalar_t*), - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - int info; - auto info_gpu = static_cast(THCudaMalloc(state, sizeof(int))); - - auto ipiv_gpu = static_cast(THCudaMalloc(state, n * sizeof(int))); - - // Run LU -#if defined(THC_REAL_IS_FLOAT) - THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); -#else - THCudaBlas_Dgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); -#endif - - THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); - - if (info > 0) - THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("CUBLAS getrf : Argument %d : illegal value", -info); - - // Inverse -#if defined(THC_REAL_IS_FLOAT) - THCudaBlas_Sgetri(state, n, (const scalar_t**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); -#else - THCudaBlas_Dgetri(state, n, (const scalar_t**)d_matrices1, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); -#endif - - THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); - - if (info > 0) - THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); - else if (info < 0) - THError("CUBLAS getri : Argument %d : illegal value", -info); - - THCudaFree(state, ipiv_gpu); - THCudaFree(state, info_gpu); - - THCudaFree(state, d_matrices1); - THCudaFree(state, d_matrices2); - - THCTensor_(free)(state, input); -#endif -} - __global__ void THCTensor_(copyUpperSymmetric)(scalar_t *input, int n, int len) { for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) { diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h index f388f68517c6..0ae49cd65007 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.h +++ b/aten/src/THC/generic/THCTensorMathMagma.h @@ -12,7 +12,6 @@ THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, const char *some, const char* compute_uv); THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *some, const char* compute_uv); -THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a); THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_); THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index c16e36909dd6..667db64cf53b 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -2,6 +2,95 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.cu" #else +int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +{ + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (!THCTensor_(isSameSizeAs(state, self_, src_))) { + return 0; + } + + // This is not as efficient as TH, but the basic idea: create a buffer that stores + // 1 if the two tensors are equal at a position, otherwise 0. 
If the minimum value + // in this buffer is 1, the two tensors are equal, otherwise they are not + + THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, self_->sizes(), {}); + + if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + unsigned char min = THCudaByteTensor_minall(state, buf); + + THCudaByteTensor_free(state, buf); + + return min != 0; +} + +void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitand only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) + return THError("bitxor only supported for integer type tensors"); +#else + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +#if !defined(THC_REAL_IS_BOOL) + void THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); @@ -196,91 +285,6 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ THCudaCheck(cudaGetLastError()); } -int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); - if (!THCTensor_(isSameSizeAs(state, self_, src_))) { - return 0; - } - - // This is not as efficient as TH, but the basic idea: create a buffer that stores - // 1 if the two tensors are equal at a position, otherwise 0. 
If the minimum value - // in this buffer is 1, the two tensors are equal, otherwise they are not - - THCudaByteTensor *buf = THCudaByteTensor_newWithSize(state, self_->sizes(), {}); - - if (!THC_pointwiseApply3(state, buf, self_, src_, TensorEQOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - unsigned char min = THCudaByteTensor_minall(state, buf); - - THCudaByteTensor_free(state, buf); - - return min != 0; -} - -void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - return THError("bitand only supported for integer type tensors"); -#else - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorBitAndConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorBitAndConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} - -void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - return THError("bitor only supported for integer type tensors"); -#else - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorBitOrConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorBitOrConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); #endif -} - -void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) -{ -#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) - return THError("bitxor only supported for integer type tensors"); -#else - if (self_ == src_) { - if (!THC_pointwiseApply1(state, self_, TensorBitXorConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src_); - - if (!THC_pointwiseApply2(state, self_, src_, TensorBitXorConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} #endif diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h index 26efe9db5bfd..bcfb95b103c4 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.h +++ b/aten/src/THC/generic/THCTensorMathPairwise.h @@ -2,6 +2,14 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathPairwise.h" #else +THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); + +THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); +THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); +THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); + +#if !defined(THC_REAL_IS_BOOL) + THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(add_scaled)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value, scalar_t alpha); @@ -12,10 +20,7 @@ THC_API void THCTensor_(lshift)(THCState *state, THCTensor *self, THCTensor *src THC_API 
void THCTensor_(rshift)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(remainder)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(bitand)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(bitor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(bitxor)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); +#endif #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 6a2ee33ed126..f4a86df732a9 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -4,6 +4,106 @@ #include +void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitand is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + +void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + return THError("cbitor is only supported for integer type tensors"); +#else + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +#endif +} + 
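// Illustrative sketch (hypothetical names, plain C++, not taken from the diff):
// reference semantics of the pointwise ops relocated here. Despite the
// "self /= src2" comments, the functors applied are TensorBitAndOp /
// TensorBitOrOp / TensorBitXorOp, i.e. elementwise &, | and ^; sign maps each
// element of a signed type to -1, 0 or 1.
template <typename T>
static void cbitand_reference_sketch(T* self, const T* a, const T* b, long n) {
  for (long i = 0; i < n; ++i) self[i] = a[i] & b[i];  // cbitor uses |, cbitxor uses ^
}
template <typename T>
static void sign_reference_sketch(T* out, const T* in, long n) {
  for (long i = 0; i < n; ++i)
    out[i] = (in[i] > T(0)) ? T(1) : ((in[i] < T(0)) ? T(-1) : T(0));
}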
+void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) { + THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); + if (self_ == src) { + if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src); + + if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#if !defined(THC_REAL_IS_BOOL) + #define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_(NAME, CFUNC, REAL) \ struct Tensor_##NAME##_##REAL##_Op { \ __device__ __forceinline__ void operator()(scalar_t* out, scalar_t* in) const { \ @@ -75,23 +175,6 @@ IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( abs, THCNumerics::abs, Real) #undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC_ #undef IMPLEMENT_CUDA_TENSOR_BASIC_FUNC -void THCTensor_(sign)(THCState* state, THCTensor* self_, THCTensor* src) { - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); - if (self_ == src) { - if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src); - - if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t min_value, scalar_t max_value) { @@ -552,84 +635,5 @@ void THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar THCudaCheck(cudaGetLastError()); } -void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ -#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - return THError("cbitand is only supported for integer type tensors"); -#else - THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self /= src2 - if (!THC_pointwiseApply2(state, self_, src2, TensorBitAndOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src1); - - // self = src1 / src2 - if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitAndOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); #endif -} - -void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ -#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - return THError("cbitor is only supported for integer type tensors"); -#else - THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self /= src2 - if (!THC_pointwiseApply2(state, self_, src2, TensorBitOrOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src1); - - // self = src1 / src2 - if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitOrOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} - -void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) -{ -#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - return THError("cbitor is only 
supported for integer type tensors"); -#else - THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self /= src2 - if (!THC_pointwiseApply2(state, self_, src2, TensorBitXorOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self_, src1); - - // self = src1 / src2 - if (!THC_pointwiseApply3(state, self_, src1, src2, TensorBitXorOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -#endif -} #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h index 5539e8ed1bf8..4f4f209e23d3 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.h +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -2,6 +2,14 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathPointwise.h" #else +THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src); + +#if !defined(THC_REAL_IS_BOOL) + THC_API void THCTensor_(pow)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(tpow)(THCState *state, THCTensor *self, scalar_t value, THCTensor *src); THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); @@ -45,7 +53,6 @@ THC_API void THCTensor_(cinv)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(neg)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(abs)(THCState *state, THCTensor *self, THCTensor *src); -THC_API void THCTensor_(sign)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(clamp)(THCState *state, THCTensor *self, THCTensor *src, scalar_t min_value, scalar_t max_value); THC_API void THCTensor_(crossKernel)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2, int dimension); @@ -61,11 +68,9 @@ THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1 THC_API void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(cbitand)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -THC_API void THCTensor_(cbitor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -THC_API void THCTensor_(cbitxor)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(addcmul)(THCState *state, THCTensor *self, THCTensor* t, scalar_t value, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(addcdiv)(THCState *state, THCTensor *self, THCTensor* t, scalar_t value, THCTensor *src1, THCTensor *src2); #endif +#endif diff --git a/aten/src/THC/generic/THCTensorMathReduce.h b/aten/src/THC/generic/THCTensorMathReduce.h index e4eec0fcb418..4f48c54f76c0 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.h +++ b/aten/src/THC/generic/THCTensorMathReduce.h @@ -21,7 +21,6 @@ THC_API accreal 
THCTensor_(varall)(THCState *state, THCTensor *self, int biased) THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim); -THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self); THC_API accreal THCTensor_(meanall)(THCState *state, THCTensor *self); THC_API void THCTensor_(min)(THCState *state, diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 0ee87a27abb8..61225d19b784 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -8,21 +8,6 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); - ptrdiff_t size = THCTensor_(nElement)(state, self_); - if (size == 0) return; - THCGenerator* gen = THCRandom_getGenerator(state); - THCTensor *self = THCTensor_(newContiguous)(state, self_); - scalar_t *data = THCTensor_(data)(state, self); - - generate_uniform<<>>( - gen->state.gen_states, size, data, a, b); - - THCTensor_(freeCopyTo)(state, self, self_); -}; - void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -191,7 +176,8 @@ void THCTensor_(multinomial)(struct THCState *state, // Uniform random samples in a separate kernel launch, into // temporarily allocated memory. The device RNG is thread-limited THCTensor *sampled = THCTensor_(newWithSize2d)(state, numDist, n_sample); - THCTensor_(uniform)(state, sampled, 0.0, 1.0); + auto out = THTensor_wrap(sampled); + at::native::uniform_cuda_(out, 0.0, 1.0); dim3 block(numCategories < maxThreads ? numCategories : maxThreads); dim3 grid(numDist < numSM * 4 ? 
numDist : numSM * 4); @@ -380,8 +366,10 @@ void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, T THCTensor *uniform = THCTensor_(newWithSize1d)(state, n_sample); THCTensor *bernoulli = THCTensor_(newWithSize1d)(state, n_sample); - THCTensor_(uniform)(state, uniform, 0, K); - THCTensor_(uniform)(state, bernoulli, 0, 1); + auto out_uniform = THTensor_wrap(uniform); + auto out_bernoulli = THTensor_wrap(bernoulli); + at::native::uniform_cuda_(out_uniform, 0, K); + at::native::uniform_cuda_(out_bernoulli, 0, 1); multinomialAliasDrawKernel <<>>( diff --git a/aten/src/THC/generic/THCTensorRandom.h b/aten/src/THC/generic/THCTensorRandom.h index 552207d9a885..578c77a6b7e5 100644 --- a/aten/src/THC/generic/THCTensorRandom.h +++ b/aten/src/THC/generic/THCTensorRandom.h @@ -4,7 +4,6 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void THCTensor_(uniform)(struct THCState *state, THCTensor *self, double a, double b); THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv); THC_API void THCTensor_(normal_means)(struct THCState *state, THCTensor *self, THCTensor *means, double stddev); THC_API void THCTensor_(normal_stddevs)(struct THCState *state, THCTensor *self, double mean, THCTensor *stddevs); diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 3c32868c6846..c4e6df5b7d9e 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -33,24 +33,16 @@ ${CMAKE_CURRENT_SOURCE_DIR}/SpatialConvolutionMM.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialCrossMapLRN.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDepthwiseConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialSubSampling.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBicubic.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingBilinear.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialUpSamplingNearest.cu ${CMAKE_CURRENT_SOURCE_DIR}/Sqrt.cu ${CMAKE_CURRENT_SOURCE_DIR}/Square.cu ${CMAKE_CURRENT_SOURCE_DIR}/Tanh.cu ${CMAKE_CURRENT_SOURCE_DIR}/TemporalConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/TemporalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/TemporalRowConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingLinear.cu -${CMAKE_CURRENT_SOURCE_DIR}/TemporalUpSamplingNearest.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAdaptiveAveragePooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricAveragePooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedConvolution.cu @@ -59,8 +51,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingNearest.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricUpSamplingTrilinear.cu PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu deleted file mode 100644 index 5adefab5bbd8..000000000000 --- a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu +++ /dev/null @@ -1,121 +0,0 @@ -#include -#include 
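// Illustrative sketch (hypothetical names, plain C++, not taken from the diff):
// the deleted MaxPoolForward kernel below derives a dilated pooling window for
// each output location. Its index arithmetic, lifted out of the kernel, is:
static inline void dilated_window_sketch(int p, int stride, int pad, int kernel,
                                         int dilation, int input_size,
                                         int* start, int* end) {
  int s = p * stride - pad;                 // unclamped window start
  int e = s + (kernel - 1) * dilation + 1;  // one past the last tap
  if (e > input_size) e = input_size;       // clamp to the input extent
  while (s < 0) s += dilation;              // skip padding while keeping the dilation phase
  *start = s;
  *end = e;
}
// The kernel then scans the window with step `dilation`, tracking the maximum
// value and its flattened input index for use in the backward pass.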
-#include -#include -#include -#include -#include - -// kernels borrowed from Caffe -template -__global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, - const int num, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, Dtype* top_data, - int64_t* top_mask) { - CUDA_KERNEL_LOOP(index, nthreads) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); - while(hstart < 0) - hstart += dilation_h; - while(wstart < 0) - wstart += dilation_w; - AccType maxval = THCNumerics::min(); - int maxidx = -1; - bottom_data += (n * channels + c) * height * width; - for (int h = hstart; h < hend; h += dilation_h) { - for (int w = wstart; w < wend; w += dilation_w) { - Dtype val = bottom_data[h * width + w]; - if ((ScalarConvert::to(val) > maxval) || THCNumerics::isnan(val)) { - maxidx = h * width + w; - maxval = ScalarConvert::to(val); - } - } - } - top_data[index] = ScalarConvert::to(maxval); - top_mask[index] = maxidx; - } -} - -const int BACKWARD_THREADS = 256; - -template -#if defined (__HIP_PLATFORM_HCC__) -C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 4) -#else -C10_LAUNCH_BOUNDS_2(BACKWARD_THREADS, 8) -#endif -__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, - const int64_t* top_mask, const int num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, - Dtype* bottom_diff) { - CUDA_KERNEL_LOOP(index, height*width) { - int h = index/width; - int w = index - h * width; -//get some templating performance benefits without actually templating - int phstart, phend, pwstart, pwend; - if (stride_h == 1) { - phstart = - (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1; - phend = min((h + pad_h) + 1, pooled_height); - } else if (stride_h == 2) { - phstart = - (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1; - phend = min((h + pad_h) / 2 + 1, pooled_height); - } else { - phstart = - (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; - phend = min((h + pad_h) / stride_h + 1, pooled_height); - } - if (stride_w == 1) { - pwstart = - (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1; - pwend = min((w + pad_w) + 1, pooled_width); - } else if (stride_w == 2) { - pwstart = - (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1; - pwend = min((w + pad_w) / 2 + 1, pooled_width); - } else { - pwstart = - (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 
0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; - pwend = min((w + pad_w) / stride_w + 1, pooled_width); - } - for (int n = blockIdx.y; n < num; n += gridDim.y) - for (int c = blockIdx.z; c < channels; c+= gridDim.z) { - - AccType gradient = AccType(0); - int offset = (n * channels + c) * pooled_height * pooled_width; - top_diff += offset; - top_mask += offset; -//get some templating performance benefits without actually templating - if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) { - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask[ph * pooled_width + pw] == h * width + w) { - gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); - } - } - } - } else { - if (top_mask[phstart * pooled_width + pwstart] == h * width + w) { - gradient += ScalarConvert::to(top_diff[phstart * pooled_width + pwstart]); - } - } - bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert::to(gradient); - } - } -} - -#include -#include diff --git a/aten/src/THCUNN/SpatialMaxPooling.cu b/aten/src/THCUNN/SpatialMaxPooling.cu deleted file mode 100644 index c01ea5957b7f..000000000000 --- a/aten/src/THCUNN/SpatialMaxPooling.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include - -#include -#include diff --git a/aten/src/THCUNN/SpatialUpSamplingBicubic.cu b/aten/src/THCUNN/SpatialUpSamplingBicubic.cu deleted file mode 100644 index ae22582bd902..000000000000 --- a/aten/src/THCUNN/SpatialUpSamplingBicubic.cu +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -#if defined(__HIP_PLATFORM_HCC__) -__launch_bounds__(1024) -#endif -__global__ void bicubic_interp2d_kernel( - const int num_elements, - const Acctype height_scale, - const Acctype width_scale, - const bool align_corners, - const THCDeviceTensor in_data, - THCDeviceTensor out_data -) { - - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = in_data.getSize(0); - const int channels = in_data.getSize(1); - const int input_height = in_data.getSize(2); - const int input_width = in_data.getSize(3); - const int output_height = out_data.getSize(2); - const int output_width = out_data.getSize(3); - - if (index >= num_elements) { - return; - } - - // Special case: input and output are the same size, just copy - const int output_x = index % output_width; - const int output_y = index / output_width; - if (input_height == output_height && input_width == output_width) { - for (int n = 0; n < batchsize; n++){ - for (int c = 0; c < channels; c++) { - const Dtype val = in_data[n][c][output_y][output_x]; - out_data[n][c][output_x][output_y] = val; - } - } - return; - } - - // Interpolation kernel - Acctype real_x = area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); - int in_x = floorf(real_x); - Acctype t_x = real_x - in_x; - - Acctype real_y = area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); - int in_y = floorf(real_y); - Acctype t_y = real_y - in_y; - - for (int n = 0; n < batchsize ; n++) { - for (int c = 0; c < channels; c++) { - Acctype coefficients[4]; - - for (int k = 0; k < 4; k++) { - coefficients[k] = cubic_interp1d( - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x - 1, in_y - 1 + k), - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x + 0, in_y - 1 + k), - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x + 1, in_y 
- 1 + k), - upsampling_get_value_bounded( - in_data, c, n, input_width, input_height, in_x + 2, in_y - 1 + k), - t_x - ); - } - - out_data[n][c][output_y][output_x] = ScalarConvert::to(cubic_interp1d( - coefficients[0], - coefficients[1], - coefficients[2], - coefficients[3], - t_y - )); - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -#if defined(__HIP_PLATFORM_HCC__) -__launch_bounds__(1024) -#endif -__global__ void bicubic_interp2d_backward_kernel( - const int num_elements, - const Acctype height_scale, - const Acctype width_scale, - const bool align_corners, - THCDeviceTensor in_data, - const THCDeviceTensor out_data -){ - - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = in_data.getSize(0); - const int channels = in_data.getSize(1); - const int input_height = in_data.getSize(2); - const int input_width = in_data.getSize(3); - const int output_height = out_data.getSize(2); - const int output_width = out_data.getSize(3); - - if (index >= num_elements) { - return; - } - - const int output_x = index % output_width; - const int output_y = index / output_width; - // special case: output_xust copy - if (input_height == output_height && input_width == output_width) { - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = out_data[n][c][output_y][output_x]; - in_data[n][c][output_y][output_x] += val; - } - } - return; - } - - Acctype real_x = area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); - int input_x = floorf(real_x); - Acctype t_x = real_x - input_x; - - Acctype real_y = area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); - int input_y = floorf(real_y); - Acctype t_y = real_y - input_y; - - Acctype x_coeffs[4]; - Acctype y_coeffs[4]; - - get_cubic_upsampling_coefficients(x_coeffs, t_x); - get_cubic_upsampling_coefficients(y_coeffs, t_y); - - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - Dtype out_value = out_data[n][c][output_y][output_x]; - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - upsampling_increment_value_bounded( - in_data, - c, - n, - input_width, - input_height, - input_x - 1 + j, - input_y - 1 + i, - out_value * y_coeffs[i] * x_coeffs[j] - ); - } - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu deleted file mode 100644 index ca9906d8b8eb..000000000000 --- a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu +++ /dev/null @@ -1,130 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rheight, const Acctype rwidth, const bool align_corners, - const THCDeviceTensor data1, THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == 
width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - return; - } - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] - + w1lambda * data1[n][c][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] - + w1lambda * data1[n][c][h1+h1p][w1+w1p]); - data2[n][c][h2][w2] = ScalarConvert::to(val); - } - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel_backward(const int n, - const Acctype rheight, const Acctype rwidth, const bool align_corners, - THCDeviceTensor data1, const THCDeviceTensor data2){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][h1][w1]; - data1[n][c][h2][w2] += val; - } - } - return; - } - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][h2][w2]; - atomicAdd(data1[n][c][h1][w1].data(), - ScalarConvert::to(h0lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][h1][w1+w1p].data(), - ScalarConvert::to(h0lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][h1+h1p][w1].data(), - ScalarConvert::to(h1lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(), - ScalarConvert::to(h1lambda * w1lambda * d2val)); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu deleted file mode 100644 index 13777b2bb28a..000000000000 --- a/aten/src/THCUNN/SpatialUpSamplingNearest.cu +++ /dev/null @@ -1,108 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_4d_kernel( - const int n, - const THCDeviceTensor data1, - THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - } -} - -// Backward operation -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_4d_kernel_backward( - const int n, - THCDeviceTensor data1, - const THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int height1 = data1.getSize(2); - const int width1 = data1.getSize(3); - const int height2 = data2.getSize(2); - const int width2 = data2.getSize(3); - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][h2][w2]; - data1[n][c][h1][w1] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, 
width1); - - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][h2][w2]; - atomicAdd(data1[n][c][h1][w1].data(), d2val); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu deleted file mode 100644 index a58cfb3196f1..000000000000 --- a/aten/src/THCUNN/TemporalUpSamplingLinear.cu +++ /dev/null @@ -1,104 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rwidth, const bool align_corners, - const THCDeviceTensor data1, THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][w1]; - data2[n][c][w2] = val; - } - } - return; - } - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Acctype val = w0lambda * data1[n][c][w1] - + w1lambda * data1[n][c][w1+w1p]; - data2[n][c][w2] = ScalarConvert::to(val); - } - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void caffe_gpu_interp2_kernel_backward(const int n, - const Acctype rwidth, const bool align_corners, - THCDeviceTensor data1, const THCDeviceTensor data2){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][w1]; - data1[n][c][w2] += val; - } - } - return; - } - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][w2]; - atomicAdd(data1[n][c][w1].data(), - ScalarConvert::to(w0lambda * d2val)); - atomicAdd(data1[n][c][w1+w1p].data(), - ScalarConvert::to(w1lambda * d2val)); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu deleted file mode 100644 index b10f5e1392e7..000000000000 --- a/aten/src/THCUNN/TemporalUpSamplingNearest.cu +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_3d_kernel( - const int n, - const THCDeviceTensor data1, - THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - const float scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][w1]; - data2[n][c][w2] = val; - } - } - return; - } - // - const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][w1]; - data2[n][c][w2] = val; - } - } - } -} - -// Backward operation -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_3d_kernel_backward( - const int n, - THCDeviceTensor data1, - const THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int width1 = data1.getSize(2); - const int width2 = data2.getSize(2); - const float scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = index % width2; - // special case: just copy - if (width1 == width2) { - const int w1 = w2; - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][w1]; - data1[n][c][w2] = val; - } - } - return; - } - // - const int w1 = nearest_neighbor_compute_source_index(scale, w2, width1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][w2]; - atomicAdd(data1[n][c][w1].data(), d2val); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu deleted file mode 100644 index e94183e2cfc7..000000000000 --- a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu +++ /dev/null @@ -1,248 +0,0 @@ -#include -#include -#include -#include -#include - -#define START_IND(a,b,c) (int)floor((float)(a * c) / b) -#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) -// #define START_IND(a,b,c) a * c / b -// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 - - -#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit - -// 5d tensor B x D x T x H x W -// All kernels view batch dim B and feature dim D as collapsed. 
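// The START_IND/END_IND macros above define, for output index `a` out of `b`
// output cells spanning an input extent `c`, the half-open input range
// [floor(a*c/b), ceil((a+1)*c/b)). The bins always cover the whole input, and
// adjacent bins share an element exactly when c is not a multiple of b, which
// is why the scatter-style backward kernel further below guards its writes
// with atomicAdd. A minimal host-side sketch of the same arithmetic
// (illustrative only, not part of the deleted file):
#include <cmath>
#include <cstdio>

static inline int start_ind(int a, int b, int c) { return (int)std::floor((float)(a * c) / b); }
static inline int end_ind(int a, int b, int c)   { return (int)std::ceil((float)((a + 1) * c) / b); }

static void print_adaptive_bins(int input_size, int output_size) {
  for (int a = 0; a < output_size; ++a) {
    // Each output cell averages input[start..end) along this dimension.
    std::printf("out %d <- in [%d, %d)\n", a,
                start_ind(a, output_size, input_size),
                end_ind(a, output_size, input_size));
  }
}
// e.g. print_adaptive_bins(10, 4) reports bins [0,3), [2,5), [5,8), [7,10):
// the first two and last two bins overlap by one element.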
- -/* - * Description: - * This function adaptively average pools an input 5D tensor along dimensions - * 2, 3 and 4. - * - * gridDim.y blocks work together on a single 2D output plane specified by - * (blockIdx.x + offsetZ). - */ - template -__global__ void cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel( - T *input, T *output, - int isizeT, int isizeH, int isizeW, - int osizeT, int osizeH, int osizeW, - int64_t istrideD, - int64_t istrideT, int64_t istrideH, int64_t istrideW, - int64_t offsetZ) -{ - // iterators on output pixels - int ot, oh, ow; - - // compute offsets based on thread/block ID - int ostartH = blockIdx.y * blockDim.y + threadIdx.y; - int oendH = osizeH; - int ostepH = gridDim.y * blockDim.y; - int ostartW = threadIdx.x; - int oendW = osizeW; - int ostepW = blockDim.x; - - // select output plane - int64_t o_plane = blockIdx.x + offsetZ; - ot = o_plane % osizeT; // output frame/time - int d = o_plane / osizeT; // slice/feature - - // input frame/time ramge is fixed. - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - // input offset by slice/feature and earliest relevant frame/time - T *input_dt = input + d*istrideD + istartT*istrideT; - // output offset by slice/feature and frame/time - T *output_dt = output + o_plane*osizeH*osizeW; - - // For all output pixels... - for(oh = ostartH; oh < oendH; oh += ostepH) { - - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = ostartW; ow < oendW; ow += ostepW) { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - // Compute the average pooling from corresponding input pixels - T *ptr_input = input_dt + istartH*istrideH + istartW*istrideW; - T *ptr_output = output_dt + oh*osizeW + ow; - T sum = ScalarConvert::to(0); - - int it, ih, iw; - for(it = 0; it < kT; ++it) { - for(ih = 0; ih < kH; ++ih) { - for(iw = 0; iw < kW; ++iw) { - T val = ptr_input[ih*istrideH + iw*istrideW]; - sum += val; - } - } - ptr_input += istrideT; // next input frame - } - // Update output - *ptr_output = sum / kT / kH / kW; - } - } -} - -/* - * Description: - * This function computes the gradInput from gradOutput. - * - * gridDim.y blocks work together on a single 2D input plane specified by - * (blockIdx.x + offsetZ). - */ - template -__global__ void cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel( - T *gradInput, T *gradOutput, - int isizeT, int isizeH, int isizeW, - int osizeT, int osizeH, int osizeW, - int64_t offsetZ -) -{ - // iterators on input pixels - int it, ih, iw; - - // compute offsets based on thread/block ID - int istartH = blockIdx.y * blockDim.y + threadIdx.y; - int iendH = isizeH; - int istepH = gridDim.y * blockDim.y; - int istartW = threadIdx.x; - int iendW = isizeW; - int istepW = blockDim.x; - - // select input plane - int64_t i_plane = blockIdx.x + offsetZ; - it = i_plane % isizeT; // output frame/time - int d = i_plane / isizeT; // slice/feature - - // output frame/time ramge is fixed. - int ostartT = START_IND(it, isizeT, osizeT); - int oendT = END_IND(it, isizeT, osizeT); - - // gradInput offset by slice/feature and frame/time - T *gradInput_dt = gradInput + i_plane*isizeH*isizeW; - // gradOutput offset by slice/feature and earliest relevant frame/time - T *gradOutput_dt = gradOutput + (d*osizeT + ostartT)*osizeH*osizeW; - - // For all input pixels... 
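// Each 2D output plane is owned by the blocks with blockIdx.x + offsetZ equal
// to that plane's linear id, and the kernels above decode that id back into a
// frame index and a collapsed batch*feature slice. The generic wrapper later
// in this diff caps blocks.x at 65535 per launch and bumps offsetZ between
// launches, which is why the decode starts from offsetZ rather than from
// blockIdx.x alone. Host-side sketch of that decode and chunked-launch pattern
// (illustrative only, not part of the deleted file):
#include <algorithm>
#include <cstdint>
#include <functional>

// Decode a linear plane id into (collapsed batch*feature slice, output frame).
static inline void decode_plane(int64_t plane_id, int osizeT, int &d, int &ot) {
  ot = static_cast<int>(plane_id % osizeT);  // output frame/time
  d  = static_cast<int>(plane_id / osizeT);  // collapsed batch*feature slice
}

// Chunked-launch pattern used by the wrapper: each launch covers plane ids
// [offsetZ, offsetZ + blocksX), never more than 65535 planes at a time.
static void for_each_chunk(int64_t totalZ,
                           const std::function<void(int64_t offsetZ, int blocksX)> &launch) {
  int64_t offsetZ = 0;
  while (totalZ > 0) {
    launch(offsetZ, static_cast<int>(std::min<int64_t>(totalZ, 65535)));
    totalZ -= 65535;
    offsetZ += 65535;
  }
}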
- for(ih = istartH; ih < iendH; ih += istepH) { - - int ostartH = START_IND(ih, isizeH, osizeH); - int oendH = END_IND(ih, isizeH, osizeH); - - for(iw = istartW; iw < iendW; iw += istepW) { - - int ostartW = START_IND(iw, isizeW, osizeW); - int oendW = END_IND(iw, isizeW, osizeW); - - // Compute the gradients from corresponding output pixels - T *ptr_gradInput = gradInput_dt + ih*isizeW + iw; - T *ptr_gradOutput = gradOutput_dt; - - // for all relevant output pixels - int ot, oh, ow; - for(ot = ostartT; ot < oendT; ++ot) { - int kT = END_IND(ot, osizeT, isizeT) - START_IND(ot, osizeT, isizeT); - for(oh = ostartH; oh < oendH; ++oh) { - int kH = END_IND(oh, osizeH, isizeH) - START_IND(oh, osizeH, isizeH); - for(ow = ostartW; ow < oendW; ++ow) { - int kW = END_IND(ow, osizeW, isizeW) - START_IND(ow, osizeW, isizeW); - T grad_delta = ptr_gradOutput[oh*osizeW + ow] / kW / kH / kT; - *ptr_gradInput += grad_delta; - } - } - ptr_gradOutput += osizeH*osizeW; // next output frame - } - } - } -} - -/* - * Description: - * This function computes the gradInput from gradOutput without assuming - * dependencies between input pixels and output pixels. - * - * gridDim.y blocks work together on a single 2D output plane specified by - * (blockIdx.x + offsetZ). - * - * (uses atomic add) - */ - template -__global__ void cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel( - T *gradInput, T *gradOutput, - int isizeT, int isizeH, int isizeW, - int osizeT, int osizeH, int osizeW, - int64_t offsetZ -) -{ - // iterators on output pixels - int ot, oh, ow; - - // compute offsets based on thread/block ID - int ostartH = blockIdx.y * blockDim.y + threadIdx.y; - int oendH = osizeH; - int ostepH = gridDim.y * blockDim.y; - int ostartW = threadIdx.x; - int oendW = osizeW; - int ostepW = blockDim.x; - - // select output plane - int64_t o_plane = blockIdx.x + offsetZ; - ot = o_plane % osizeT; // output frame/time - int d = o_plane / osizeT; // output slice/feature - - // input frame/time ramge is fixed. - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - // gradInput offset by slice/feature and earliest relevant frame/time - T *gradInput_nt = gradInput + (d*isizeT + istartT)*isizeH*isizeW; - // gradOutput offset by slice/feature and frame/time - T *gradOutput_nt = gradOutput + o_plane*osizeH*osizeW; - - // For all output pixels... 
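// The two gradInput kernels differ only in which side owns the loop: the
// non-atomic one above gathers into each input cell (one writer per cell, no
// races), while the atomic one below scatters from each output cell and needs
// atomicAdd because adaptive bins can share input cells. A simplified 1D CPU
// model of both strategies (illustrative only, not part of the deleted file):
#include <cmath>
#include <vector>

static inline int bin_start(int a, int b, int c) { return (int)std::floor((float)(a * c) / b); }
static inline int bin_end(int a, int b, int c)   { return (int)std::ceil((float)((a + 1) * c) / b); }

// Gather: iterate input cells, pull from every output bin that covers them.
static void backward_gather_1d(std::vector<float> &gin, const std::vector<float> &gout) {
  const int in = (int)gin.size(), out = (int)gout.size();
  for (int i = 0; i < in; ++i)
    for (int o = bin_start(i, in, out); o < bin_end(i, in, out); ++o) {
      int k = bin_end(o, out, in) - bin_start(o, out, in);  // bin size of output cell o
      gin[i] += gout[o] / k;
    }
}

// Scatter: iterate output bins, push into every input cell they cover.
// On the GPU these "+=" can collide across threads, hence atomicAdd.
static void backward_scatter_1d(std::vector<float> &gin, const std::vector<float> &gout) {
  const int in = (int)gin.size(), out = (int)gout.size();
  for (int o = 0; o < out; ++o) {
    int s = bin_start(o, out, in), e = bin_end(o, out, in);
    for (int i = s; i < e; ++i) gin[i] += gout[o] / (e - s);
  }
}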
- for(oh = ostartH; oh < oendH; oh += ostepH) { - - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = ostartW; ow < oendW; ow += ostepW) { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - // Compute the gradients from corresponding input pixels - T *ptr_gradInput = gradInput_nt + istartH*isizeW + istartW; - T *ptr_gradOutput = gradOutput_nt + oh*osizeW + ow; - T grad_delta = *ptr_gradOutput / kT / kH / kW; - - int it, ih, iw; - for(it = 0; it < kT; ++it) { - for(ih = 0; ih < kH; ++ih) { - for(iw = 0; iw < kW; ++iw) { - atomicAdd(&(ptr_gradInput[ih*isizeW + iw]), grad_delta); - } - } - ptr_gradInput += isizeH*isizeW; // next input frame - } - } - } -} - -#include -#include - -#undef CUDA_MAX_THREADS -#undef START_IND -#undef END_IND diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu deleted file mode 100644 index 8e98b400a0f0..000000000000 --- a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_5d_kernel( - const int n, - const THCDeviceTensor data1, - THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int width2 = data2.getSize(4); - const float depth_scale = (float) depth1 / (float) depth2; - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int d2 = index / (height2*width2); // 0:depth2-1 - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int d1 = d2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][d1][h1][w1]; - data2[n][c][d2][h2][w2] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); - const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][d1][h1][w1]; - data2[n][c][d2][h2][w2] = val; - } - } - } -} - -// Backward operation -template -#ifdef __HIP_PLATFORM_HCC__ -C10_LAUNCH_BOUNDS_1(1024) -#endif -__global__ void nearest_neighbor_5d_kernel_backward( - const int n, - THCDeviceTensor data1, - const THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int 
width2 = data2.getSize(4); - const float depth_scale = (float) depth1 / (float) depth2; - const float height_scale = (float) height1 / (float) height2; - const float width_scale = (float) width1 / (float) width2; - - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int d2 = index / (height2*width2); // 0:depth2-1 - - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int d1 = d2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][d1][h1][w1]; - data1[n][c][d2][h2][w2] = val; - } - } - return; - } - // - const int h1 = nearest_neighbor_compute_source_index(height_scale, h2, height1); - const int w1 = nearest_neighbor_compute_source_index(width_scale, w2, width1); - const int d1 = nearest_neighbor_compute_source_index(depth_scale, d2, depth1); - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][d2][h2][w2]; - atomicAdd(data1[n][c][d1][h1][w1].data(), val); - } - } - } -} - - -#include -#include diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu deleted file mode 100644 index 48d72bba86d5..000000000000 --- a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu +++ /dev/null @@ -1,160 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -template -C10_LAUNCH_BOUNDS_1(1024) -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, - const THCDeviceTensor data1, THCDeviceTensor data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int width2 = data2.getSize(4); - - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int t2 = index / (height2*width2); // 0:depth2-1 - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int t1 = t2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][t1][h1][w1]; - data2[n][c][t2][h2][w2] = val; - } - } - return; - } - // - const Acctype t1r = area_pixel_compute_source_index(rdepth, t2, align_corners, /*cubic=*/false); - const int t1 = t1r; - const int t1p = (t1 < depth1 - 1) ? 1 : 0; - const Acctype t1lambda = t1r - t1; - const Acctype t0lambda = Acctype(1) - t1lambda; - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 
1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Acctype val = t0lambda * (h0lambda * (w0lambda * data1[n][c][t1][h1][w1] - + w1lambda * data1[n][c][t1][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][t1][h1+h1p][w1] - + w1lambda * data1[n][c][t1][h1+h1p][w1+w1p])) - + t1lambda * (h0lambda * (w0lambda * data1[n][c][t1+t1p][h1][w1] - + w1lambda * data1[n][c][t1+t1p][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][t1+t1p][h1+h1p][w1] - + w1lambda * data1[n][c][t1+t1p][h1+h1p][w1+w1p])); - data2[n][c][t2][h2][w2] = ScalarConvert::to(val); - } - } - } -} - -// Backward (adjoint) operation 1 <- 2 (accumulates) -template -C10_LAUNCH_BOUNDS_1(1024) -__global__ void caffe_gpu_interp2_kernel_backward(const int n, - const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners, - THCDeviceTensor data1, const THCDeviceTensor data2){ - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.getSize(0); - const int channels = data1.getSize(1); - const int depth1 = data1.getSize(2); - const int height1 = data1.getSize(3); - const int width1 = data1.getSize(4); - const int depth2 = data2.getSize(2); - const int height2 = data2.getSize(3); - const int width2 = data2.getSize(4); - if (index < n) { - const int w2 = (index % (height2*width2)) % width2; // 0:width2-1 - const int h2 = (index % (height2*width2)) / width2; // 0:height2-1 - const int t2 = index / (height2*width2); // 0:depth2-1 - // special case: just copy - if (depth1 == depth2 && height1 == height2 && width1 == width2) { - const int t1 = t2; - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype val = data2[n][c][t1][h1][w1]; - data1[n][c][t2][h2][w2] += val; - } - } - return; - } - // - const Acctype t1r = area_pixel_compute_source_index(rdepth, t2, align_corners, /*cubic=*/false); - const int t1 = t1r; - const int t1p = (t1 < depth1 - 1) ? 1 : 0; - const Acctype t1lambda = t1r - t1; - const Acctype t0lambda = Acctype(1) - t1lambda; - // - const Acctype h1r = area_pixel_compute_source_index(rheight, h2, align_corners, /*cubic=*/false); - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = area_pixel_compute_source_index(rwidth, w2, align_corners, /*cubic=*/false); - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++){ - for (int c = 0; c < channels; ++c) { - const Dtype d2val = data2[n][c][t2][h2][w2]; - atomicAdd(data1[n][c][t1][h1][w1].data(), - ScalarConvert::to(t0lambda * h0lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1][h1][w1+w1p].data(), - ScalarConvert::to(t0lambda * h0lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][t1][h1+h1p][w1].data(), - ScalarConvert::to(t0lambda * h1lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1][h1+h1p][w1+w1p].data(), - ScalarConvert::to(t0lambda * h1lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1][w1].data(), - ScalarConvert::to(t1lambda * h0lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1][w1+w1p].data(), - ScalarConvert::to(t1lambda * h0lambda * w1lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1].data(), - ScalarConvert::to(t1lambda * h1lambda * w0lambda * d2val)); - atomicAdd(data1[n][c][t1+t1p][h1+h1p][w1+w1p].data(), - ScalarConvert::to(t1lambda * h1lambda * w1lambda * d2val)); - } - } - } - ///////////////////////////////////////////////////////// -} - - -#include -#include diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu index 78055cf38729..29b2834e5def 100644 --- a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -8,10 +8,10 @@ void THNN_(SpatialClassNLLCriterion_shapeCheck)( THCIndexTensor *target, THCTensor *weights) { - AT_CHECK(!target->is_empty() && target->dim() == 3, 1, + TORCH_CHECK(!target->is_empty() && target->dim() == 3, 1, "only batches of spatial targets supported (non-empty 3D tensors)" \ " but got targets of size: : ", target->sizes()); - AT_CHECK(!input->is_empty() && input->dim() == 4, 2, + TORCH_CHECK(!input->is_empty() && input->dim() == 4, 2, "only batches of spatial inputs supported (non-empty 4D tensors), " \ "but got input of size: ", input->sizes()); if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || @@ -33,7 +33,7 @@ static void THNN_(SpatialClassNLLCriterion_gradOutput_no_reduce_shapeCheck)( THCTensor *gradOutput, THCIndexTensor *target) { - AT_CHECK(!gradOutput->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, gradOutput) == 3, 2, + TORCH_CHECK(!gradOutput->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, gradOutput) == 3, 2, "Expected non-empty dimension 3 but got gradOutput of size: ", gradOutput->sizes()); if (THCTensor_(size)(state, gradOutput, 0) != THCIndexTensor_(size)(state, target, 0) || THCTensor_(size)(state, gradOutput, 1) != THCIndexTensor_(size)(state, target, 1) || diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu index c75a8bed967c..9e9825414096 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -53,7 +53,7 @@ static THCTensor* THNN_(view_weight_local)( THCTensor *_weight) { THCTensor *weight = THCTensor_(newContiguous)(state, _weight); - AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, + TORCH_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); if (weight->dim() == 6) { int64_t s1 = weight->size(0) * weight->size(1); diff --git 
a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu deleted file mode 100644 index aa3cb035e3d3..000000000000 --- a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu +++ /dev/null @@ -1,199 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialDilatedMaxPooling.cu" -#else - -#include -#include -#include - -static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( - THCState *state, - THCTensor *input, THCTensor *gradOutput, THCIndexTensor *indices, - int kH, int kW, int dH, int dW, int padH, int padW, - int dilationH, int dilationW, bool ceil_mode) { - - THArgCheck(kW > 0 && kH > 0, 5, - "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); - THArgCheck(dW > 0 && dH > 0, 8, - "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - THArgCheck(dilationH > 0 && dilationW > 0, 12, - "dilation should be greater than zero, but got dilationH: %d dilationW: %d", - dilationH, dilationW); - - int ndim = input->dim(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - int batchSize = 1; - - if (ndim == 4) { - batchSize = input->size(0); - dimf++; - dimh++; - dimw++; - } - - THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, - "non-empty 3D or 4D input tensor expected but got: %s"); - THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, - "pad should be smaller than half of kernel size, but got " - "padW = %d, padH = %d, kW = %d, kH = %d", - padW, padH, kW, kH); - - int64_t nInputPlane = input->size(dimh-1); - int64_t nInputRows = input->size(dimh); - int64_t nInputCols = input->size(dimw); - int64_t nOutputPlane = nInputPlane; - - int64_t nOutputRows = pooling_output_shape(nInputRows, kH, padH, dH, dilationH, ceil_mode); - int64_t nOutputCols = pooling_output_shape(nInputCols, kW, padW, dW, dilationW, ceil_mode); - - if (nOutputCols < 1 || nOutputRows < 1) - THError("Given input size: (%dx%dx%d). " - "Calculated output size: (%dx%dx%d). 
Output size is too small", - nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); - THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows); - THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols); - } - if (indices != NULL) { - THCUNN_check_dim_size_indices(state, indices, 4, 0, batchSize); - THCUNN_check_dim_size_indices(state, indices, 4, 1, nOutputPlane); - THCUNN_check_dim_size_indices(state, indices, 4, 2, nOutputRows); - THCUNN_check_dim_size_indices(state, indices, 4, 3, nOutputCols); - } -} - -void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode) -{ - - THCUNN_assertSameGPU(state, 3, input, output, indices); - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (state, input, NULL, NULL, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - int64_t nInputCols, nInputRows, nInputPlane, batchSize; - int64_t nOutputCols, nOutputRows; - - if (input->dim() == 3) { - nInputCols = input->size(2); - nInputRows = input->size(1); - nInputPlane = input->size(0); - batchSize = 1; - } - else - { - nInputCols = input->size(3); - nInputRows = input->size(2); - nInputPlane = input->size(1); - batchSize = input->size(0); - } - - nOutputCols = pooling_output_shape(nInputCols, kW, padW, dW, dilationW, ceil_mode); - nOutputRows = pooling_output_shape(nInputRows, kH, padH, dH, dilationH, ceil_mode); - - input = THCTensor_(newContiguous)(state, input); - scalar_t* input_data = THCTensor_(data)(state, input); - - THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); - THCUNN_resizeAs_indices(state, indices, output); - - THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices); - scalar_t* output_data = THCTensor_(data)(state, output); - - int count = THCTensor_(nElement)(state, output); - - MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, input_data, - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); - THCudaCheck(cudaGetLastError()); - - if(input->dim() == 3) - THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); - - THCTensor_(free)(state, input); -} - -void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (state, input, gradOutput, indices, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - input = THCTensor_(newContiguous)(state, input); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - int64_t nInputCols, nInputRows, nInputPlane, batchSize; - int64_t nOutputCols, nOutputRows; - - if (THTensor_nDimensionLegacyAll(input) == 3) { - nInputCols = input->size(2); - nInputRows = input->size(1); - nInputPlane = input->size(0); - batchSize = 1; - } - else - { - nInputCols = input->size(3); - nInputRows = input->size(2); - nInputPlane = input->size(1); - batchSize = input->size(0); 
- } - - nOutputCols = pooling_output_shape(nInputCols, kW, padW, dW, dilationW, ceil_mode); - nOutputRows = pooling_output_shape(nInputRows, kH, padH, dH, dilationH, ceil_mode); - - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resizeAs)(state, gradInput, input); - - int count = THCTensor_(nElement)(state, input); - dim3 grid; - int imgcount = nInputCols * nInputRows; - const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS; - grid.x = blocks; - grid.y = batchSize; - grid.z = nInputPlane; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridY < grid.y) grid.y = maxGridY; - if (maxGridZ < grid.z) grid.z = maxGridZ; - MaxPoolBackward <<< grid, BACKWARD_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, - THCTensor_(data)(state, gradOutput), - THCIndexTensor_(data)(state, indices), - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - THCTensor_(data)(state, gradInput)); - THCudaCheck(cudaGetLastError()); - - THCTensor_(free)(state, gradOutput); - - // clean - THCTensor_(free)(state, input); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialMaxPooling.cu b/aten/src/THCUNN/generic/SpatialMaxPooling.cu deleted file mode 100644 index 21a65f506a1d..000000000000 --- a/aten/src/THCUNN/generic/SpatialMaxPooling.cu +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialMaxPooling.cu" -#else - -#include - -void THNN_(SpatialMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode) -{ - THNN_(SpatialDilatedMaxPooling_updateOutput)( - state, input, output, indices, - kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); - -} - -void THNN_(SpatialMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode) -{ - THNN_(SpatialDilatedMaxPooling_updateGradInput)( - state, input, gradOutput, gradInput, indices, - kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); - -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBicubic.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBicubic.cu deleted file mode 100644 index b984745147c0..000000000000 --- a/aten/src/THCUNN/generic/SpatialUpSamplingBicubic.cu +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialUpSamplingBicubic.cu" -#else - -#include -#include - -static inline void THNN_(SpatialUpSamplingBicubic_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputHeight, int inputWidth, - int outputHeight, int outputWidth) { - THArgCheck(inputHeight > 0 && inputWidth > 0 - && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (H: %d, W: %d) output (H: %d, W: %d)", - inputHeight, inputWidth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); - 
THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); - } -} - -void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); - THNN_(SpatialUpSamplingBicubic_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputHeight, inputWidth, - outputHeight, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize4d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputHeight, outputWidth); - THCTensor_(zero)(state, output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); - - // Get scaling factors - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - - const int num_output_elements = outputHeight * outputWidth; - const int max_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - - // Launch kernel - cudaStream_t stream = THCState_getCurrentStream(state); - bicubic_interp2d_kernel <<< - THCCeilDiv(num_output_elements, max_threads), - max_threads, - 0, - stream - >>>(num_output_elements, rheight, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - THNN_(SpatialUpSamplingBicubic_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputHeight, inputWidth, - outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor in_data = toDeviceTensor(state, gradInput); - THCDeviceTensor out_data = toDeviceTensor(state, gradOutput); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - bicubic_interp2d_backward_kernel <<>>(num_kernels, rheight, rwidth, align_corners, in_data, out_data); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu deleted file mode 100644 index 6afb863f8dc3..000000000000 --- a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialUpSamplingBilinear.cu" -#else - 
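// Every upsampling kernel in this diff maps an output coordinate back to a
// fractional input coordinate through area_pixel_compute_scale() and
// area_pixel_compute_source_index(), which live in a header that is not shown
// here. A sketch of their usual semantics (an assumption, from memory, not
// quoted from this diff):
static inline float area_pixel_scale(int input_size, int output_size, bool align_corners) {
  if (align_corners)
    return output_size > 1 ? (float)(input_size - 1) / (output_size - 1) : 0.f;
  return (float)input_size / output_size;
}

static inline float area_pixel_source_index(float scale, int dst_index,
                                            bool align_corners, bool cubic) {
  if (align_corners) return scale * dst_index;
  float src = scale * (dst_index + 0.5f) - 0.5f;  // align pixel centers
  return (!cubic && src < 0.f) ? 0.f : src;       // linear/nearest modes clamp at 0
}
// The kernels then split the result into an integer index plus a lambda weight,
// e.g. h1 = (int)h1r and h1lambda = h1r - h1 in caffe_gpu_interp2_kernel.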
-#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputHeight, int inputWidth, - int outputHeight, int outputWidth) { - THArgCheck(inputHeight > 0 && inputWidth > 0 - && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (H: %d, W: %d) output (H: %d, W: %d)", - inputHeight, inputWidth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); - } -} - -void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); - THNN_(SpatialUpSamplingBilinear_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputHeight, inputWidth, - outputHeight, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize4d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputHeight, outputWidth); - THCTensor_(zero)(state, output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel <<>>(num_kernels, rheight, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - THNN_(SpatialUpSamplingBilinear_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputHeight, inputWidth, - outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; 
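// The wrapper above launches caffe_gpu_interp2_kernel with one thread per
// output pixel; the per-pixel arithmetic is the 2D bilinear blend of the four
// neighbouring input pixels using the h/w lambda weights computed earlier in
// this diff. A single-channel CPU reference of that blend (illustrative only;
// assumes a row-major height1 x width1 input buffer):
#include <vector>

static float bilinear_sample(const std::vector<float> &in, int height1, int width1,
                             float h1r, float w1r) {
  const int h1 = (int)h1r, w1 = (int)w1r;
  const int h1p = (h1 < height1 - 1) ? 1 : 0;   // step to the next row, if any
  const int w1p = (w1 < width1 - 1) ? 1 : 0;    // step to the next column, if any
  const float h1lambda = h1r - h1, h0lambda = 1.f - h1lambda;
  const float w1lambda = w1r - w1, w0lambda = 1.f - w1lambda;
  return h0lambda * (w0lambda * in[h1 * width1 + w1] +
                     w1lambda * in[h1 * width1 + w1 + w1p]) +
         h1lambda * (w0lambda * in[(h1 + h1p) * width1 + w1] +
                     w1lambda * in[(h1 + h1p) * width1 + w1 + w1p]);
}
// h1r/w1r are the fractional source coordinates produced by
// area_pixel_compute_source_index for the output pixel being filled.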
- cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rheight, rwidth, align_corners, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu deleted file mode 100644 index 85a7b831561d..000000000000 --- a/aten/src/THCUNN/generic/SpatialUpSamplingNearest.cu +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/SpatialUpSamplingNearest.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputHeight, int inputWidth, - int outputHeight, int outputWidth) { - THArgCheck(inputHeight > 0 && inputWidth > 0 - && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (H: %d, W: %d) output (H: %d, W: %d)", - inputHeight, inputWidth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, THTensor_nDimensionLegacyAll(input) == 4, 2, input, - "4D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth); - } -} - - -void THNN_(SpatialUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, input, output); - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); - - THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, - inputHeight, inputWidth, - outputHeight, outputWidth); - THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0); - - THCTensor_(resize4d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputHeight, - outputWidth); - THCTensor_(zero)(state, output); - - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - - const int num_kernels = outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_4d_kernel <<>>(num_kernels, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - - -void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THNN_(SpatialUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, - inputHeight, inputWidth, outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth); - - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = 
toDeviceTensor(state, gradOutput); - - const int num_kernels = outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - - nearest_neighbor_4d_kernel_backward <<>>(num_kernels, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index 58b4b3c42c8f..9739dc06b18c 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -665,29 +665,6 @@ THC_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( int adjW, int adjH, accreal scale); -THC_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - -THC_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - THC_API void THNN_(SpatialFullConvolution_updateOutput)( THCState *state, THCTensor *input, @@ -727,27 +704,6 @@ THC_API void THNN_(SpatialFullConvolution_accGradParameters)( int adjW, int adjH, accreal scale); -THC_API void THNN_(SpatialMaxPooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode); - -THC_API void THNN_(SpatialMaxPooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput, - THCIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode); - THC_API void THNN_(SpatialMaxUnpooling_updateOutput)( THCState *state, THCTensor *input, @@ -791,64 +747,6 @@ THC_API void THNN_(SpatialSubSampling_accGradParameters)( int dW, int dH, accreal scale); -THC_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth); - -THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputHeight, - int outputWidth); - THC_API void THNN_(RReLU_updateOutput)( THCState *state, THCTensor *input, @@ -1043,38 +941,6 @@ THC_API 
void THNN_(TemporalRowConvolution_accGradParameters)( bool featFirst, accreal scale); -THC_API void THNN_(TemporalUpSamplingLinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth, - bool align_corners); - -THC_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth, - bool align_corners); - -THC_API void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth); - -THC_API void THNN_(TemporalUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth); - THC_API void THNN_(VolumetricAveragePooling_updateOutput)( THCState *state, THCTensor *input, @@ -1315,62 +1181,4 @@ THC_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int padT, int padW, int padH); -THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int osizeT, - int osizeW, - int osizeH); - -THC_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput); - -THC_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth); - -THC_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth); - -THC_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners); - -THC_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners); - #endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu deleted file mode 100644 index 4b1ef97c9c09..000000000000 --- a/aten/src/THCUNN/generic/TemporalUpSamplingLinear.cu +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/TemporalUpSamplingLinear.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(TemporalUpSamplingLinear_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputWidth, - int outputWidth) { - THArgCheck(inputWidth > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (W: %d) output (W: %d)", - inputWidth, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 3, 2, input, - "non-empty 3D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); - } -} - -void 
THNN_(TemporalUpSamplingLinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputWidth = THCTensor_(size)(state, input, 2); - THNN_(TemporalUpSamplingLinear_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputWidth, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize3d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputWidth); - THCTensor_(zero)(state, output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputWidth > 0 && outputWidth > 0); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel <<>>(num_kernels, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(TemporalUpSamplingLinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth, - bool align_corners) -{ - THNN_(TemporalUpSamplingLinear_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputWidth, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rwidth, align_corners, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu deleted file mode 100644 index 1658b180c85a..000000000000 --- a/aten/src/THCUNN/generic/TemporalUpSamplingNearest.cu +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/TemporalUpSamplingNearest.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(TemporalUpSamplingNearest_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputWidth, - int outputWidth) { - THArgCheck(inputWidth > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (W: %d) output (W: %d)", - inputWidth, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, THTensor_nDimensionLegacyAll(input) == 3, 2, input, - "3D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 3, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 3, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth); - } -} - -void THNN_(TemporalUpSamplingNearest_updateOutput)( 
- THCState *state, - THCTensor *input, - THCTensor *output, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, input, output); - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputWidth = THCTensor_(size)(state, input, 2); - - THNN_(TemporalUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, inputWidth, outputWidth); - THAssert(inputWidth > 0 && outputWidth > 0); - - THCTensor_(resize3d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputWidth); - THCTensor_(zero)(state, output); - - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - - const int num_kernels = outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_3d_kernel <<>>(num_kernels, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputWidth, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THNN_(TemporalUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, inputWidth, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resize3d)(state, gradInput, nbatch, nchannels, inputWidth); - - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - - const int num_kernels = outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - - nearest_neighbor_3d_kernel_backward <<>>(num_kernels, data1, data2); - - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu deleted file mode 100644 index 5e315a407029..000000000000 --- a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu +++ /dev/null @@ -1,173 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/VolumetricAdaptiveAveragePooling.cu" -#else - -#include - -// 5d tensor B x D x T x H x W - -void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int osizeT, - int osizeW, - int osizeH) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); - - - scalar_t *output_data; - scalar_t *input_data; - - int64_t sizeD, isizeT, isizeH, isizeW; - int64_t istrideD, istrideT, istrideH, istrideW; - int64_t totalZ; - - if (input->dim() == 4) { - sizeD = input->size(0); - isizeT = input->size(1); - isizeH = input->size(2); - isizeW = input->size(3); - - istrideD = input->stride(0); - istrideT = input->stride(1); - istrideH = input->stride(2); - istrideW = input->stride(3); - - THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); - - totalZ = sizeD * osizeT; - } else { - input = THCTensor_(newContiguous)(state, input); - - int64_t sizeB = input->size(0); - sizeD = input->size(1); - isizeT = input->size(2); - isizeH 
= input->size(3); - isizeW = input->size(4); - - istrideD = input->stride(1); - istrideT = input->stride(2); - istrideH = input->stride(3); - istrideW = input->stride(4); - - THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); - - totalZ = sizeB * sizeD * osizeT; - } - - input_data = THCTensor_(data)(state, input); - output_data = THCTensor_(data)(state, output); - - int64_t offsetZ = 0; - dim3 threads(32, 8); - // each H*W plane is processed by blocksH thread blocks - int blocksH = max((int)(16L / totalZ), 1); - while (totalZ > 0) { - dim3 blocks(totalZ > 65535 ? 65535 : totalZ, blocksH); - cunn_VolumetricAdaptiveAveragePooling_updateOutput_kernel - <<>>( - input_data, output_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, - istrideD, istrideT, istrideH, istrideW, offsetZ - ); - - totalZ -= 65535; - offsetZ += 65535; - THCudaCheck(cudaGetLastError()); - } - - if (input->dim() == 5) { - // clean - THCTensor_(free)(state, input); - } -} - -void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THCState *state, - THCTensor *input, - THCTensor *gradOutput, - THCTensor *gradInput) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - - THCTensor_(resizeAs)(state, gradInput, input); - THCTensor_(zero)(state, gradInput); - - scalar_t *gradInput_data; - scalar_t *gradOutput_data; - - int64_t sizeD, isizeT, isizeH, isizeW; - int64_t osizeT, osizeH, osizeW; - int64_t totalZ; - - if (input->dim() == 4) { - sizeD = input->size(0); - isizeT = input->size(1); - isizeH = input->size(2); - isizeW = input->size(3); - - osizeT = gradOutput->size(1); - osizeH = gradOutput->size(2); - osizeW = gradOutput->size(3); - } else { - sizeD = input->size(1); - isizeT = input->size(2); - isizeH = input->size(3); - isizeW = input->size(4); - - osizeT = gradOutput->size(2); - osizeH = gradOutput->size(3); - osizeW = gradOutput->size(4); - } - - // somehow nonatomic is passing all test for volumetric case. - bool atomic = false; //(isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); - - if (input->dim() == 4) { - totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; - } else { - int sizeB = input->size(0); - totalZ = atomic ? sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; - } - - gradInput_data = THCTensor_(data)(state, gradInput); - gradOutput_data = THCTensor_(data)(state, gradOutput); - - int64_t offsetZ = 0; - dim3 threads(32, 8); - // each H*W plane is processed by blocksH thread blocks - int blocksH = max((int)(16L / totalZ), 1); - while (totalZ > 0) { - dim3 blocks(totalZ > 65535 ? 
65535 : totalZ, blocksH); - - if (atomic) - { - cunn_atomic_VolumetricAdaptiveAveragePooling_updateGradInput_kernel - <<>>( - gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, offsetZ - ); - } else { - cunn_VolumetricAdaptiveAveragePooling_updateGradInput_kernel - <<>>( - gradInput_data, gradOutput_data, isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, offsetZ - ); - } - - totalZ -= 65535; - offsetZ += 65535; - THCudaCheck(cudaGetLastError()); - } - // clean - THCTensor_(free)(state, gradOutput); - -} - -#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu deleted file mode 100644 index 7b3a142876f3..000000000000 --- a/aten/src/THCUNN/generic/VolumetricUpSamplingNearest.cu +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/VolumetricUpSamplingNearest.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputDepth, int inputHeight, int inputWidth, - int outputDepth, int outputHeight, int outputWidth) { - THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 - && outputDepth && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", - inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, THTensor_nDimensionLegacyAll(input) == 5, 2, input, - "5D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); - THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); - } -} - - -void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, input, output); - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputDepth = THCTensor_(size)(state, input, 2); - int inputHeight = THCTensor_(size)(state, input, 3); - int inputWidth = THCTensor_(size)(state, input, 4); - - THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, input, NULL, nbatch, channels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && - outputDepth > 0 && outputHeight > 0 && outputWidth > 0); - - THCTensor_(resize5d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputDepth, - outputHeight, - outputWidth); - THCTensor_(zero)(state, output); - - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_5d_kernel <<>>(num_kernels, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - - -void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - 
THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THNN_(VolumetricUpSamplingNearest_shapeCheck)(state, NULL, gradOutput, nbatch, nchannels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); - - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - nearest_neighbor_5d_kernel_backward <<>>(num_kernels, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu deleted file mode 100644 index 73e0655352e3..000000000000 --- a/aten/src/THCUNN/generic/VolumetricUpSamplingTrilinear.cu +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "THCUNN/generic/VolumetricUpSamplingTrilinear.cu" -#else - -#include -#include "ATen/cuda/CUDAContext.h" - -static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) - (THCState *state, - THCTensor *input, THCTensor *gradOutput, - int nBatch, int nChannels, - int inputDepth, int inputHeight, int inputWidth, - int outputDepth, int outputHeight, int outputWidth) { - THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 - && outputDepth && outputHeight > 0 && outputWidth > 0, 2, - "input and output sizes should be greater than 0," - " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)", - inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth); - if (input != NULL) { - THCUNN_argCheck(state, !input->is_empty() && input->dim() == 5, 2, input, - "non-empty 5D input tensor expected but got: %s"); - } - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 5, 0, nBatch); - THCUNN_check_dim_size(state, gradOutput, 5, 1, nChannels); - THCUNN_check_dim_size(state, gradOutput, 5, 2, outputDepth); - THCUNN_check_dim_size(state, gradOutput, 5, 3, outputHeight); - THCUNN_check_dim_size(state, gradOutput, 5, 4, outputWidth); - } -} - -void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *output, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - int nbatch = THCTensor_(size)(state, input, 0); - int channels = THCTensor_(size)(state, input, 1); - int inputDepth = THCTensor_(size)(state, input, 2); - int inputHeight = THCTensor_(size)(state, input, 3); - int inputWidth = THCTensor_(size)(state, input, 4); - THNN_(VolumetricUpSamplingTrilinear_shapeCheck) - (state, input, NULL, - nbatch, channels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - - THCUNN_assertSameGPU(state, 2, input, output); - THCTensor_(resize5d)(state, output, - THCTensor_(size)(state, input, 0), - THCTensor_(size)(state, input, 1), - outputDepth, outputHeight, outputWidth); - THCTensor_(zero)(state, 
output); - THCDeviceTensor idata = toDeviceTensor(state, input); - THCDeviceTensor odata = toDeviceTensor(state, output); - THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && outputDepth > 0 && outputHeight > 0 && outputWidth > 0); - const accreal rdepth = area_pixel_compute_scale(inputDepth, outputDepth, align_corners); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, idata, odata); - THCudaCheck(cudaGetLastError()); -} - - -void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THCState *state, - THCTensor *gradOutput, - THCTensor *gradInput, - int nbatch, - int nchannels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) -{ - THNN_(VolumetricUpSamplingTrilinear_shapeCheck) - (state, NULL, gradOutput, - nbatch, nchannels, - inputDepth, inputHeight, inputWidth, - outputDepth, outputHeight, outputWidth); - gradOutput = THCTensor_(newContiguous)(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCTensor_(resize5d)(state, gradInput, nbatch, nchannels, inputDepth, inputHeight, inputWidth); - THCTensor_(zero)(state, gradInput); - THCDeviceTensor data1 = toDeviceTensor(state, gradInput); - THCDeviceTensor data2 = toDeviceTensor(state, gradOutput); - const accreal rdepth = area_pixel_compute_scale(inputDepth, outputDepth, align_corners); - const accreal rheight = area_pixel_compute_scale(inputHeight, outputHeight, align_corners); - const accreal rwidth = area_pixel_compute_scale(inputWidth, outputWidth, align_corners); - const int num_kernels = outputDepth * outputHeight * outputWidth; - const int num_threads = - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward <<>>(num_kernels, rdepth, rheight, rwidth, align_corners, data1, data2); - THCudaCheck(cudaGetLastError()); - THCTensor_(free)(state, gradOutput); -} - -#endif diff --git a/aten/src/THCUNN/upsampling.h b/aten/src/THCUNN/upsampling.h deleted file mode 100644 index 66daea10d754..000000000000 --- a/aten/src/THCUNN/upsampling.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef THCUNN_UPSAMPLING_H -#define THCUNN_UPSAMPLING_H - -#include -#include - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) - - -template -__host__ __forceinline__ -static Acctype area_pixel_compute_scale( - int inputSize, int outputSize, bool align_corners) { - if (outputSize > 1) { - return align_corners ? (Acctype) (inputSize - 1) / (outputSize - 1) - : (Acctype) inputSize / outputSize; - } else { - return Acctype(0); - } -} - -template -__device__ __forceinline__ -static Acctype area_pixel_compute_source_index( - Acctype scale, int dst_index, bool align_corners, bool cubic) { - if (align_corners) { - return scale * dst_index; - } else { - Acctype src_idx = scale * (dst_index + Acctype(0.5)) - Acctype(0.5); - // See Note[Follow Opencv resize logic] - return (!cubic && src_idx < Acctype(0)) ? 
Acctype(0) : src_idx; - } -} - -__device__ __forceinline__ -static int nearest_neighbor_compute_source_index( - const float scale, int dst_index, int inputSize) { - const int src_index = MIN(floor(dst_index * scale), inputSize - 1); - return src_index; -} - -template -__device__ __forceinline__ -static Dtype upsampling_get_value_bounded( - const THCDeviceTensor data, - int channel, - int batch, - int width, - int height, - int x, - int y -) { - int access_x = max(min(x, width - 1), 0); - int access_y = max(min(y, height - 1), 0); - return data[batch][channel][access_y][access_x]; -} - -template -__device__ __forceinline__ -static void upsampling_increment_value_bounded( - const THCDeviceTensor data, - int channel, - int batch, - int width, - int height, - int x, - int y, - Acctype value -) { - int access_x = max(min(x, width - 1), 0); - int access_y = max(min(y, height - 1), 0); - atomicAdd( - data[batch][channel][access_y][access_x].data(), - ScalarConvert::to(value) - ); -} - -// Based on https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm -template -__device__ __forceinline__ -static Acctype cubic_convolution1(Acctype x, Acctype A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; -} - -template -__device__ __forceinline__ -static Acctype cubic_convolution2(Acctype x, Acctype A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; -} - -template -__device__ __forceinline__ -static void get_cubic_upsampling_coefficients( - Acctype coeffs[4], - Acctype t -) { - Acctype A = -0.75; - - Acctype x1 = t; - coeffs[0] = cubic_convolution2(x1 + 1.0, A); - coeffs[1] = cubic_convolution1(x1, A); - - // opposite coefficients - Acctype x2 = 1.0 - t; - coeffs[2] = cubic_convolution1(x2, A); - coeffs[3] = cubic_convolution2(x2 + 1.0, A); -} - -template -__device__ __forceinline__ -static Acctype cubic_interp1d( - Dtype x0, - Dtype x1, - Dtype x2, - Dtype x3, - Acctype t -) { - Acctype coeffs[4]; - get_cubic_upsampling_coefficients(coeffs, t); - - return x0 * coeffs[0] - + x1 * coeffs[1] - + x2 * coeffs[2] - + x3 * coeffs[3]; -} - -#endif diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 8be35566342d..cdc4d37a1744 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,21 +17,21 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; scalar_t sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + TORCH_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) { nframe = 1; dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + TORCH_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size(0) == nframe) + TORCH_CHECK(!target->is_empty() && target->dim() == 2 && (target->size(0) == nframe) && (target->size(1) == dim), "inconsistent target size"); } @@ -157,25 +157,25 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; scalar_t g; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + TORCH_CHECK(!input->is_empty() && input->dim() <= 2, "vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) 
{ nframe = 1; dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + TORCH_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), + TORCH_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), "inconsistent isTarget size"); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + TORCH_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) && (target->size(1) == dim), 3, "inconsistent target size"); - AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size(0) == nframe) + TORCH_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size(0) == nframe) && (isTarget->size(1) == dim), 3, "inconsistent isTarget size"); } diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 34ecf26b8ac0..00080169ab86 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,7 +20,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; scalar_t sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + TORCH_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) @@ -32,7 +32,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + TORCH_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -136,7 +136,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; scalar_t g; - AT_CHECK(!input->is_empty() && (input->dim() <= 2), + TORCH_CHECK(!input->is_empty() && (input->dim() <= 2), "non-empty vector or matrix expected, got size: ", input->sizes()); if (input->dim() <= 1) @@ -148,7 +148,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + TORCH_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c deleted file mode 100644 index 94d438a36a37..000000000000 --- a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c +++ /dev/null @@ -1,368 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/SpatialDilatedMaxPooling.c" -#else - -#include -#include - -#include - -static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( - THTensor *input, THTensor *gradOutput, THIndexTensor *indices, - int kH, int kW, int dH, int dW, int padH, int padW, - int dilationH, int dilationW, bool ceil_mode) { - - THArgCheck(kW > 0 && kH > 0, 5, - "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); - 
THArgCheck(dW > 0 && dH > 0, 8, - "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - THArgCheck(dilationH > 0 && dilationW > 0, 12, - "dilation should be greater than zero, but got dilationH: %d dilationW: %d", - dilationH, dilationW); - - int ndim = input->dim(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - - if (ndim == 4) { - dimf++; - dimh++; - dimw++; - } - - THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, - "non-empty 3D or 4D input tensor expected but got: %s"); - - THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, - "pad should be smaller than half of kernel size, but got " - "padW = %d, padH = %d, kW = %d, kH = %d", - padW, padH, kW, kH); - - int64_t nInputPlane = input->size(dimh-1); - int64_t inputHeight = input->size(dimh); - int64_t inputWidth = input->size(dimw); - int64_t nOutputPlane = nInputPlane; - - int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); - int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); - - if (outputWidth < 1 || outputHeight < 1) - THError("Given input size: (%dx%dx%d). " - "Calculated output size: (%dx%dx%d). Output size is too small", - nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); - - if (gradOutput != NULL) { - THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); - THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); - THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); - } - if (indices != NULL) { - THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane); - THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight); - THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth); - } -} - -static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)( - scalar_t *input_p, - scalar_t *output_p, - THIndex_t *ind_p, - int64_t nslices, - int64_t iwidth, - int64_t iheight, - int64_t owidth, - int64_t oheight, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - int dilationW, - int dilationH - ) -{ - at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { - /* loop over output */ - int64_t i, j; - scalar_t *ip = input_p + k*iwidth*iheight; - for(i = 0; i < oheight; i++) - { - for(j = 0; j < owidth; j++) - { - int64_t hstart = i * dH - padH; - int64_t wstart = j * dW - padW; - int64_t hend = std::min(hstart + (kH - 1) * dilationH + 1, iheight); - int64_t wend = std::min(wstart + (kW - 1) * dilationW + 1, iwidth); - while(hstart < 0) - hstart += dilationH; - while(wstart < 0) - wstart += dilationW; - - /* local pointers */ - scalar_t *op = output_p + k*owidth*oheight + i*owidth + j; - THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j; - - /* compute local max: */ - int64_t maxindex = -1; - scalar_t maxval = -THInf; - int64_t tcntr = 0; - int64_t x,y; - for(y = hstart; y < hend; y += dilationH) - { - for(x = wstart; x < wend; x += dilationW) - { - tcntr = y*iwidth + x; - scalar_t val = *(ip + tcntr); - if ((val > maxval) || std::isnan(val)) - { - maxval = val; - maxindex = tcntr; - } - } - } - - /* set output to local max */ - *op = maxval; - - /* store location of max */ - *indp = maxindex; - } - } - } - }); -} - -void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THIndexTensor *indices, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - int dilationW, - int dilationH, - bool ceil_mode) -{ - - int dimw = 2; - int dimh = 1; - 
int64_t nbatch = 1; - int64_t nInputPlane; - int64_t inputHeight; - int64_t inputWidth; - int64_t outputHeight; - int64_t outputWidth; - scalar_t *input_data; - scalar_t *output_data; - THIndex_t *indices_data; - - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (input, NULL, NULL, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - if (input->dim() == 4) - { - nbatch = input->size(0); - dimw++; - dimh++; - } - - /* sizes */ - nInputPlane = input->size(dimh-1); - inputHeight = input->size(dimh); - inputWidth = input->size(dimw); - outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); - outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); - - /* get contiguous input */ - input = THTensor_(newContiguous)(input); - - /* resize output */ - if (input->dim() == 3) - { - THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); - /* indices will contain the locations for each output point */ - THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth); - - input_data = input->data(); - output_data = output->data(); - indices_data = THIndexTensor_(data)(indices); - - THNN_(SpatialDilatedMaxPooling_updateOutput_frame) - (input_data, output_data, - indices_data, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - kW, kH, dW, dH, - padW, padH, - dilationW, dilationH - ); - } - else - { - THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth); - /* indices will contain the locations for each output point */ - THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth); - - input_data = input->data(); - output_data = output->data(); - indices_data = THIndexTensor_(data)(indices); - - at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { - THNN_(SpatialDilatedMaxPooling_updateOutput_frame) - (input_data+p*nInputPlane*inputWidth*inputHeight, - output_data+p*nInputPlane*outputWidth*outputHeight, - indices_data+p*nInputPlane*outputWidth*outputHeight, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - kW, kH, dW, dH, - padW, padH, - dilationW, dilationH - ); - } - }); - } - - /* cleanup */ - c10::raw::intrusive_ptr::decref(input); -} - -static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)( - scalar_t *gradInput_p, - scalar_t *gradOutput_p, - THIndex_t *ind_p, - int64_t nInputPlane, - int64_t inputWidth, - int64_t inputHeight, - int64_t outputWidth, - int64_t outputHeight, - int dW, - int dH) -{ - at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { - scalar_t *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; - scalar_t *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; - THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight; - - /* calculate max points */ - int64_t i, j; - for(i = 0; i < outputHeight; i++) - { - for(j = 0; j < outputWidth; j++) - { - /* retrieve position of max */ - int64_t maxp = ind_p_k[i*outputWidth + j]; - if (maxp != -1) { - /* update gradient */ - gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; - } - } - } - } - }); -} - -void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THIndexTensor *indices, - int kW, - int kH, - int dW, - int dH, - int padW, - int padH, - int dilationW, - int dilationH, - bool ceil_mode) -{ - int dimw = 2; - int dimh = 1; - int64_t nbatch = 
1; - int nInputPlane; - int inputHeight; - int inputWidth; - int outputHeight; - int outputWidth; - scalar_t *gradInput_data; - scalar_t *gradOutput_data; - THIndex_t *indices_data; - - THNN_(SpatialDilatedMaxPooling_shapeCheck) - (input, gradOutput, indices, kH, kW, dH, dW, - padH, padW, dilationH, dilationW, ceil_mode); - - /* get contiguous gradOutput */ - gradOutput = THTensor_(newContiguous)(gradOutput); - - /* resize */ - THTensor_(resizeAs)(gradInput, input); - THTensor_(zero)(gradInput); - - if (input->dim() == 4) { - nbatch = input->size(0); - dimw++; - dimh++; - } - - /* sizes */ - nInputPlane = input->size(dimh-1); - inputHeight = input->size(dimh); - inputWidth = input->size(dimw); - outputHeight = gradOutput->size(dimh); - outputWidth = gradOutput->size(dimw); - - /* get raw pointers */ - gradInput_data = gradInput->data(); - gradOutput_data = gradOutput->data(); - indices_data = THIndexTensor_(data)(indices); - - /* backprop */ - if (input->dim() == 3) - { - THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) - (gradInput_data, gradOutput_data, - indices_data, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - dW, dH); - } - else - { - at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { - THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) - (gradInput_data+p*nInputPlane*inputWidth*inputHeight, - gradOutput_data+p*nInputPlane*outputWidth*outputHeight, - indices_data+p*nInputPlane*outputWidth*outputHeight, - nInputPlane, - inputWidth, inputHeight, - outputWidth, outputHeight, - dW, dH); - } - }); - } - - /* cleanup */ - c10::raw::intrusive_ptr::decref(gradOutput); -} - -#endif diff --git a/aten/src/THNN/generic/SpatialMaxUnpooling.c b/aten/src/THNN/generic/SpatialMaxUnpooling.c index ffddf6144de4..d66164499dbf 100644 --- a/aten/src/THNN/generic/SpatialMaxUnpooling.c +++ b/aten/src/THNN/generic/SpatialMaxUnpooling.c @@ -63,7 +63,7 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( THIndex_t *indices_data; - AT_CHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), + TORCH_CHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input, but got sizes: ", input->sizes()); THNN_CHECK_SHAPE_INDICES(input, indices); diff --git a/aten/src/THNN/generic/SpatialUpSamplingBicubic.c b/aten/src/THNN/generic/SpatialUpSamplingBicubic.c deleted file mode 100644 index a81d6bc0d88a..000000000000 --- a/aten/src/THNN/generic/SpatialUpSamplingBicubic.c +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/SpatialUpSamplingBicubic.c" -#else - -void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingBilinear.c b/aten/src/THNN/generic/SpatialUpSamplingBilinear.c deleted file mode 100644 index 0dc646307b11..000000000000 --- a/aten/src/THNN/generic/SpatialUpSamplingBilinear.c +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define 
TH_GENERIC_FILE "THNN/generic/SpatialUpSamplingBilinear.c" -#else - -void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/SpatialUpSamplingNearest.c b/aten/src/THNN/generic/SpatialUpSamplingNearest.c deleted file mode 100644 index 82f8237fe677..000000000000 --- a/aten/src/THNN/generic/SpatialUpSamplingNearest.c +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/SpatialUpSamplingNearest.c" -#else - -void THNN_(SpatialUpSamplingNearest_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 04a21329e5d5..188194cf35ed 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -390,36 +390,6 @@ TH_API void THNN_(TemporalRowConvolution_accGradParameters)( bool featFirst, accreal scale); -TH_API void THNN_(TemporalUpSamplingNearest_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeW); -TH_API void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeW, - int osizeW); - -TH_API void THNN_(TemporalUpSamplingLinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeW, - bool align_corners); -TH_API void THNN_(TemporalUpSamplingLinear_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeW, - int osizeW, - bool align_corners); - TH_API void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, @@ -556,28 +526,6 @@ TH_API void THNN_(SpatialFullDilatedConvolution_accGradParameters)( int adjW, int adjH, accreal scale); -TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - THIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); -TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput, - THIndexTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - TH_API void THNN_(SpatialMaxUnpooling_updateOutput)( THNNState *state, THTensor *input, @@ -592,64 +540,6 @@ TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( THIndexTensor *indices, int owidth, int oheight); -TH_API void 
THNN_(SpatialUpSamplingNearest_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeH, - int osizeW); - -TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeH, - int isizeW, - int osizeH, - int osizeW); - -TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeH, - int isizeW, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(SpatialUpSamplingBicubic_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(SpatialUpSamplingBicubic_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeH, - int isizeW, - int osizeH, - int osizeW, - bool align_corners); - TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, @@ -808,19 +698,6 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( int dT, int dW, int dH, int pT, int pW, int pH); -TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeW, - int osizeH); -TH_API void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput); - TH_API void THNN_(FeatureLPPooling_updateOutput)( THNNState *state, THTensor *input, @@ -841,50 +718,6 @@ TH_API void THNN_(FeatureLPPooling_updateGradInput)( int stride, bool batchMode); -TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeH, - int osizeW); - -TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeT, - int isizeH, - int isizeW, - int osizeT, - int osizeH, - int osizeW); - -TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeH, - int osizeW, - bool align_corners); - -TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THNNState *state, - THTensor *gradOutput, - THTensor *gradInput, - int isizeB, - int isizeC, - int isizeT, - int isizeH, - int isizeW, - int osizeT, - int osizeH, - int osizeW, - bool align_corners); - TH_API void THNN_(Tanh_updateOutput)( THNNState *state, THTensor *input, diff --git a/aten/src/THNN/generic/TemporalUpSamplingLinear.c b/aten/src/THNN/generic/TemporalUpSamplingLinear.c deleted file mode 100644 index 69680540917b..000000000000 --- a/aten/src/THNN/generic/TemporalUpSamplingLinear.c +++ /dev/null @@ -1,29 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou - -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/TemporalUpSamplingLinear.c" -#else - -void THNN_(TemporalUpSamplingLinear_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void 
THNN_(TemporalUpSamplingLinear_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputWidth, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/TemporalUpSamplingNearest.c b/aten/src/THNN/generic/TemporalUpSamplingNearest.c deleted file mode 100644 index 0d3fca5ebb89..000000000000 --- a/aten/src/THNN/generic/TemporalUpSamplingNearest.c +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/TemporalUpSamplingNearest.c" -#else - -void THNN_(TemporalUpSamplingNearest_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(TemporalUpSamplingNearest_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputWidth, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c deleted file mode 100644 index af35f6e6b0ce..000000000000 --- a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c +++ /dev/null @@ -1,305 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/VolumetricAdaptiveAveragePooling.c" -#else - -#include - -#define START_IND(a,b,c) (int)floor((float)(a * c) / b) -#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) -// #define START_IND(a,b,c) a * c / b -// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 - -// 5d tensor B x D x T x H x W - -static void THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)( - scalar_t *input_p, - scalar_t *output_p, - int64_t sizeD, - int64_t isizeT, - int64_t isizeH, - int64_t isizeW, - int64_t osizeT, - int64_t osizeH, - int64_t osizeW, - int64_t istrideD, - int64_t istrideT, - int64_t istrideH, - int64_t istrideW) -{ - at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { - /* loop over output */ - int64_t ot, oh, ow; - for(ot = 0; ot < osizeT; ot++) - { - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - for(oh = 0; oh < osizeH; oh++) - { - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = 0; ow < osizeW; ow++) - { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - /* local pointers */ - scalar_t *ip = input_p + d*istrideD + istartT*istrideT + istartH*istrideH + istartW*istrideW; - scalar_t *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; - - /* compute local average: */ - scalar_t sum = 0; - int it, ih, iw; - for(it = 0; it < kT; it++) - { - for(ih = 0; ih < kH; ih++) - { - for(iw = 0; iw < kW; iw++) - { - scalar_t val = *(ip + it*istrideT + ih*istrideH + iw*istrideW); - sum += val; - } - } - } - - /* set output to local average */ - *op = sum / kT / kH / kW; - } - } - } - } - }); -} - -void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *output, - int osizeT, - int osizeW, - int osizeH) -{ - int dimD = 0; - int dimT = 1; - int dimH = 2; - int dimW = 3; - int64_t sizeB 
= 1; - int64_t sizeD = 0; - int64_t isizeT = 0; - int64_t isizeH = 0; - int64_t isizeW = 0; - - int64_t istrideB = 0; - int64_t istrideD = 0; - int64_t istrideT = 0; - int64_t istrideH = 0; - int64_t istrideW = 0; - - scalar_t *input_data = nullptr; - scalar_t *output_data = nullptr; - - - THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); - - if (input->dim() == 5) - { - istrideB = input->stride(0); - sizeB = input->size(0); - dimD++; - dimT++; - dimH++; - dimW++; - } - - /* sizes */ - sizeD = input->size(dimD); - isizeT = input->size(dimT); - isizeH = input->size(dimH); - isizeW = input->size(dimW); - /* strides */ - istrideD = input->stride(dimD); - istrideT = input->stride(dimT); - istrideH = input->stride(dimH); - istrideW = input->stride(dimW); - - /* resize output */ - if (input->dim() == 4) - { - THTensor_(resize4d)(output, sizeD, osizeT, osizeH, osizeW); - - input_data = input->data(); - output_data = output->data(); - - THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, - istrideD, istrideT, - istrideH, istrideW); - } - else - { - THTensor_(resize5d)(output, sizeB, sizeD, osizeT, osizeH, osizeW); - - input_data = input->data(); - output_data = output->data(); - - at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { - THNN_(VolumetricAdaptiveAveragePooling_updateOutput_frame)(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW, - istrideD, istrideT, - istrideH, istrideW); - } - }); - } -} - -static void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)( - scalar_t *gradInput_p, - scalar_t *gradOutput_p, - int64_t sizeD, - int64_t isizeT, - int64_t isizeH, - int64_t isizeW, - int64_t osizeT, - int64_t osizeH, - int64_t osizeW) -{ - at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { - scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeW*isizeH; - scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeW*osizeH; - - /* calculate average */ - int64_t ot, oh, ow; - for(ot = 0; ot < osizeT; ot++) - { - int istartT = START_IND(ot, osizeT, isizeT); - int iendT = END_IND(ot, osizeT, isizeT); - int kT = iendT - istartT; - - for(oh = 0; oh < osizeH; oh++) - { - int istartH = START_IND(oh, osizeH, isizeH); - int iendH = END_IND(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = 0; ow < osizeW; ow++) - { - - int istartW = START_IND(ow, osizeW, isizeW); - int iendW = END_IND(ow, osizeW, isizeW); - int kW = iendW - istartW; - - scalar_t grad_delta = gradOutput_p_d[ot*osizeH*osizeW + oh*osizeW + ow] / kT / kH / kW; - - int it, ih, iw; - for(it = istartT; it < iendT; it++) - { - for(ih = istartH; ih < iendH; ih++) - { - for(iw = istartW; iw < iendW; iw++) - { - /* update gradient */ - gradInput_p_d[it*isizeH*isizeW + ih*isizeW + iw] += grad_delta; - } - } - } - } - } - } - } - }); -} - -void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( - THNNState *state, - THTensor *input, - THTensor *gradOutput, - THTensor *gradInput) -{ - int dimD = 0; - int dimT = 1; - int dimH = 2; - int dimW = 3; - int64_t sizeB = 1; - int64_t sizeD; - int64_t isizeT; - int64_t isizeH; - int64_t isizeW; - int64_t osizeT; - int64_t osizeH; - int64_t osizeW; - scalar_t *gradInput_data; - scalar_t 
*gradOutput_data; - - /* get contiguous gradOutput */ - gradOutput = THTensor_(newContiguous)(gradOutput); - - /* resize */ - THTensor_(resizeAs)(gradInput, input); - THTensor_(zero)(gradInput); - - if (input->dim() == 5) { - sizeB = input->size(0); - dimD++; - dimT++; - dimH++; - dimW++; - } - - /* sizes */ - sizeD = input->size(dimD); - isizeT = input->size(dimT); - isizeH = input->size(dimH); - isizeW = input->size(dimW); - osizeT = gradOutput->size(dimT); - osizeH = gradOutput->size(dimH); - osizeW = gradOutput->size(dimW); - - /* get raw pointers */ - gradInput_data = gradInput->data(); - gradOutput_data = gradOutput->data(); - - /* backprop */ - if (input->dim() == 4) - { - THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW); - } - else - { - at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { - THNN_(VolumetricAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, - sizeD, - isizeT, isizeH, isizeW, - osizeT, osizeH, osizeW); - } - }); - } - - /* cleanup */ - c10::raw::intrusive_ptr::decref(gradOutput); -} - -#endif - -#undef START_IND -#undef END_IND diff --git a/aten/src/THNN/generic/VolumetricUpSamplingNearest.c b/aten/src/THNN/generic/VolumetricUpSamplingNearest.c deleted file mode 100644 index f4ff9442fd6c..000000000000 --- a/aten/src/THNN/generic/VolumetricUpSamplingNearest.c +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/VolumetricUpSamplingNearest.c" -#else - -void THNN_(VolumetricUpSamplingNearest_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputDepth, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(VolumetricUpSamplingNearest_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c b/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c deleted file mode 100644 index 4eaa507298f2..000000000000 --- a/aten/src/THNN/generic/VolumetricUpSamplingTrilinear.c +++ /dev/null @@ -1,35 +0,0 @@ -// Adapted from interp.cpp from Caffe util by Pauline Luc -// Originally developed by George Papandreou - -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "THNN/generic/VolumetricUpSamplingTrilinear.c" -#else - -void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( - THNNState* state, - THTensor* input, - THTensor* output, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( - THNNState* state, - THTensor* gradOutput, - THTensor* gradInput, - int nbatch, - int channels, - int inputDepth, - int inputHeight, - int inputWidth, - int outputDepth, - int outputHeight, - int outputWidth, - bool align_corners) { - AT_ERROR("This function is deprecated, please use it from ATen."); -} - -#endif diff --git a/aten/src/THNN/init.cpp b/aten/src/THNN/init.cpp index 4d026987a7d4..97f14e2a6af0 100644 --- a/aten/src/THNN/init.cpp +++ 
b/aten/src/THNN/init.cpp @@ -127,12 +127,6 @@ #include #include -#include -#include - -#include -#include - #include #include @@ -151,24 +145,12 @@ #include #include -#include -#include - #include #include -#include -#include - -#include -#include - #include #include -#include -#include - #include #include @@ -178,20 +160,11 @@ #include #include -#include -#include - #include #include #include #include -#include -#include - -#include -#include - #include #include diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh index bb76979b6ae1..69e63ac7f3fb 100755 --- a/aten/tools/run_tests.sh +++ b/aten/tools/run_tests.sh @@ -34,6 +34,9 @@ fi if [[ -x ./cuda_half_test ]]; then ./cuda_half_test fi +if [[ -x ./cuda_distributions_test ]]; then + ./cuda_distributions_test +fi if [[ -x ./cuda_optional_test ]]; then ./cuda_optional_test fi diff --git a/benchmarks/operator_benchmark/ops/repeat_benchmark.py b/benchmarks/operator_benchmark/ops/repeat_benchmark.py new file mode 100644 index 000000000000..f9e53886f1d0 --- /dev/null +++ b/benchmarks/operator_benchmark/ops/repeat_benchmark.py @@ -0,0 +1,58 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import numpy as np +import torch + +import time + +"""Microbenchmarks for Tensor repeat operator. Supports PyTorch.""" + +input_shapes = ( + (4, 4, 1), + (16, 1, 32), + (64, 64, 1, 1), + (8, 256, 128), + (1, 64, 128, 32), + (512, 512), +) + +repeats = ( + (1, 1, 1, 64), + (1, 4, 1, 2), + (1, 2, 2, 15), + (1, 1, 3, 2), + (128, 1, 8, 1), + (1, 1, 2, 16), +) + +NUM_WARMUP_ITERS = 5 +NUM_BENCHMARK_ITERS = 10 +DTYPE_TO_BYTES = {'float' : 4} + +def generate_data_for_repeat(): + input_tensors = [torch.randn(*input_shape) for input_shape in input_shapes] + total_num_elements = 0 + for input_tensor, repeat in zip(input_tensors, repeats): + total_num_elements += input_tensor.numel() + total_num_elements += input_tensor.numel() * np.prod(repeat) + return input_tensors, (total_num_elements * DTYPE_TO_BYTES['float']) + +input_tensors, total_bytes = generate_data_for_repeat() +BYTES_TO_MB = (1. / 1000. / 1000.) + +def pt_repeat(input_tensor, repeat): + return input_tensor.repeat(repeat) + +def pt_repeat_n_times(niters): + for _ in range(niters): + for input_tensor, repeat in zip(input_tensors, repeats): + pt_repeat(input_tensor, repeat) + +if __name__ == "__main__": + # Warm up runs. 
+ pt_repeat_n_times(NUM_WARMUP_ITERS) + s = time.time() + pt_repeat_n_times(NUM_BENCHMARK_ITERS) + total_time_s = (time.time() - s) + total_time_per_iter_s = total_time_s / NUM_BENCHMARK_ITERS + achieved_bandwidth = (total_bytes * BYTES_TO_MB) / total_time_per_iter_s + print("Time:{} Achieved Bandwidth:{} MB/s".format(total_time_per_iter_s, achieved_bandwidth)) diff --git a/c10/core/Backend.h b/c10/core/Backend.h index 738c9b10cf91..709769195703 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -275,4 +275,15 @@ static inline const char* toString(Backend b) { } } +static inline bool isSparse(Backend b) { + switch (b) { + case Backend::SparseCPU: + case Backend::SparseCUDA: + case Backend::SparseHIP: + return true; + default: + return false; + } +} + } // namespace c10 diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index 7ce339d751f5..8e3013836568 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -52,13 +52,24 @@ void* alloc_cpu(size_t nbytes) { #elif defined(_MSC_VER) data = _aligned_malloc(nbytes, gAlignment); #else - CAFFE_ENFORCE_EQ(posix_memalign(&data, gAlignment, nbytes), 0); + int err = posix_memalign(&data, gAlignment, nbytes); + if (err != 0) { + CAFFE_THROW( + "DefaultCPUAllocator: can't allocate memory: you tried to allocate ", + nbytes, + " bytes. Error code ", + err, + " (", + strerror(err), + ")"); + } #endif CAFFE_ENFORCE( data, - "DefaultCPUAllocator: not enough memory: you tried to allocate %dGB. Buy new RAM!", - nbytes / 1073741824); + "DefaultCPUAllocator: not enough memory: you tried to allocate ", + nbytes, + " bytes. Buy new RAM!"); // move data to a thread's NUMA node NUMAMove(data, nbytes, GetCurrentNUMANode()); diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index 79ee6251ec66..4ee041062a57 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -39,9 +39,9 @@ DeviceType parse_type(const std::string& device_string) { } // namespace void Device::validate() { - AT_CHECK(index_ == -1 || index_ >= 0, + TORCH_CHECK(index_ == -1 || index_ >= 0, "Device index must be -1 or non-negative, got ", index_); - AT_CHECK(!is_cpu() || index_ <= 0, + TORCH_CHECK(!is_cpu() || index_ <= 0, "CPU device index must be -1 or zero, got ", index_); } @@ -56,7 +56,7 @@ void Device::validate() { // std::regex_constants::basic); // std::smatch match; // const bool ok = std::regex_match(device_string, match, regex); -// AT_CHECK(ok, "Invalid device string: '", device_string, "'"); +// TORCH_CHECK(ok, "Invalid device string: '", device_string, "'"); // if (match[1].matched) { // type_ = parse_type_from_string(match[1].str()); // } else { @@ -69,14 +69,14 @@ void Device::validate() { // index_ = std::stoi(match[3].str()); // } Device::Device(const std::string& device_string) : Device(Type::CPU) { - AT_CHECK(!device_string.empty(), "Device string must not be empty"); + TORCH_CHECK(!device_string.empty(), "Device string must not be empty"); int index = device_string.find(":"); if (index == std::string::npos) { type_ = parse_type(device_string); } else { std::string s; s = device_string.substr(0, index); - AT_CHECK(!s.empty(), "Device string must not be empty"); + TORCH_CHECK(!s.empty(), "Device string must not be empty"); type_ = parse_type(s); std::string device_index = device_string.substr(index + 1); @@ -86,7 +86,7 @@ Device::Device(const std::string& device_string) : Device(Type::CPU) { AT_ERROR("Could not parse device index '", device_index, "' in device string '", device_string, "'"); } - AT_CHECK(index_ >= 0, + 
TORCH_CHECK(index_ >= 0, "Device index must be non-negative, got ", index_); } validate(); diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h new file mode 100644 index 000000000000..14209900e4c1 --- /dev/null +++ b/c10/core/MemoryFormat.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include + +#include + +// Memory format is not the property of a Tensor. It is the way to tell an +// operator how the result should be organized in memory and nothing more. That +// means memory format should never be used as return value for any tensor state +// interrogation functions (internally and externally). +// +// Possible options are: +// Any: +// An operator can return Tensor with any memory format. This describes the +// current behavior of operators. +// +// Preserve: +// If any of the input tensors is in channels_last format, operator output +// should be in channels_last format +// +// Contiguous: +// Regardless of input tensors format, the output should be contiguous Tensor. +// +// ChannelsLast: +// Regardless of input tensors format, the output should be in channels_last format. + + +namespace c10 { +enum class MemoryFormat : int8_t { Any, Preserve, Contiguous, ChannelsLast }; + +inline std::ostream& operator<<( + std::ostream& stream, + at::MemoryFormat memory_format) { + switch (memory_format) { + case MemoryFormat::Any: + return stream << "Any"; + case MemoryFormat::Preserve: + return stream << "Preserve"; + case MemoryFormat::Contiguous: + return stream << "Contiguous"; + case MemoryFormat::ChannelsLast: + return stream << "ChannelsLast"; + default: + AT_ERROR("Unknown memory format"); + } +} + +} // namespace c10 diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index b34ed427d34f..6cb2102b2b0c 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -29,7 +29,9 @@ namespace c10 { _(std::complex, ComplexFloat, z) /* 9 */ \ _(std::complex, ComplexDouble, z) /* 10 */ \ _(bool, Bool, i) /* 11 */ \ - _(c10::qint8, QInt8, i) /* 12 */ + _(c10::qint8, QInt8, i) /* 12 */ \ + _(c10::quint8, QUInt8, i) /* 13 */ \ + _(c10::qint32, QInt32, i) /* 14 */ // If you want to support ComplexHalf for real, replace occurrences // of this macro with AT_FORALL_SCALAR_TYPES_WITH_COMPLEX. 
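The new c10/core/MemoryFormat.h above documents four options (Any, Preserve, Contiguous, ChannelsLast) but leaves the actual choice to each operator. The following is a minimal standalone sketch of those documented rules only: the enum mirrors the new c10::MemoryFormat, and resolve_output_format is a hypothetical helper invented for illustration, not code from this PR.

#include <cstdint>
#include <iostream>
#include <vector>

enum class MemoryFormat : int8_t { Any, Preserve, Contiguous, ChannelsLast };

// For the sketch, each "input" is reduced to a flag saying whether it is
// already in channels_last layout.
MemoryFormat resolve_output_format(MemoryFormat requested,
                                   const std::vector<bool>& inputs_channels_last) {
  switch (requested) {
    case MemoryFormat::Preserve:
      // Documented rule: if any input is channels_last, the output should be too.
      for (bool channels_last : inputs_channels_last) {
        if (channels_last) return MemoryFormat::ChannelsLast;
      }
      return MemoryFormat::Contiguous;
    case MemoryFormat::Any:
      // The operator may pick anything; this sketch simply defaults to Contiguous.
      return MemoryFormat::Contiguous;
    default:
      // Contiguous and ChannelsLast are explicit requests and are passed through.
      return requested;
  }
}

int main() {
  MemoryFormat out = resolve_output_format(MemoryFormat::Preserve, {false, true});
  std::cout << (out == MemoryFormat::ChannelsLast) << "\n";  // prints 1
}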
But @@ -46,7 +48,9 @@ namespace c10 { _(std::complex, ComplexFloat, z) \ _(std::complex, ComplexDouble, z) \ _(bool, Bool, i) \ - _(c10::qint8, QInt8, i) + _(c10::qint8, QInt8, i) \ + _(c10::quint8, QUInt8, i) \ + _(c10::qint32, QInt32, i) #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_AND_QINT(_) \ _(uint8_t, Byte, i) \ @@ -70,7 +74,9 @@ namespace c10 { _(at::Half, Half, d) \ _(float, Float, d) \ _(double, Double, d) \ - _(c10::qint8, QInt8, i) + _(c10::qint8, QInt8, i) \ + _(c10::quint8, QUInt8, i) \ + _(c10::qint32, QInt32, i) #define AT_FORALL_SCALAR_TYPES_EXCEPT_QINT(_) \ _(uint8_t, Byte, i) \ @@ -101,7 +107,9 @@ namespace c10 { _(int64_t, Long, i) \ _(float, Float, d) \ _(double, Double, d) \ - _(c10::qint8, QInt8, i) + _(c10::qint8, QInt8, i) \ + _(c10::quint8, QUInt8, i) \ + _(c10::qint32, QInt32, i) #define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF_AND_QINT(_) \ _(uint8_t, Byte, i) \ @@ -223,7 +231,37 @@ static inline bool isComplexType(ScalarType t) { static inline bool isQIntType(ScalarType t) { // Don't forget to extend this when adding new QInt types - return t == ScalarType::QInt8; + return t == ScalarType:: QInt8 || t == ScalarType::QUInt8 || t == ScalarType::QInt32; +} + +static inline ScalarType toQIntType(ScalarType t) { + switch (t) { + case ScalarType::Byte: + return ScalarType::QUInt8; + case ScalarType::Char: + return ScalarType::QInt8; + case ScalarType::Int: + return ScalarType::QInt32; + default: + return t; + } +} + +static inline ScalarType toUnderlying(ScalarType t) { + switch (t) { + case ScalarType::QUInt8: + return ScalarType::Byte; + case ScalarType::QInt8: + return ScalarType::Char; + case ScalarType::QInt32: + return ScalarType::Int; + default: + return t; + } +} + +static inline bool isUnderlying(ScalarType type, ScalarType qtype) { + return type == toUnderlying(qtype); } static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { diff --git a/c10/core/Stream.h b/c10/core/Stream.h index 85b3bcff656b..01e9a341ecc2 100644 --- a/c10/core/Stream.h +++ b/c10/core/Stream.h @@ -120,7 +120,7 @@ class Stream final { auto device_index = static_cast(bits) & 0xFFFFull; bits >>= 16; auto device_type = static_cast(bits); - AT_CHECK(isValidDeviceType(device_type)); + TORCH_CHECK(isValidDeviceType(device_type)); // Unfortunately, we can't check if the StreamId is valid here; it // will be checked upon first use. 
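The ScalarType.h hunk above extends the quantized dtypes to QUInt8 and QInt32 and adds toQIntType, toUnderlying, and isUnderlying. Below is a minimal standalone sketch of the intended round trip, using a local stand-in enum rather than the real c10::ScalarType.

#include <cassert>
#include <initializer_list>

enum class ScalarType { Byte, Char, Int, QInt8, QUInt8, QInt32, Other };

// Mirrors the mapping added in the diff: Byte <-> QUInt8, Char <-> QInt8, Int <-> QInt32.
ScalarType toQIntType(ScalarType t) {
  switch (t) {
    case ScalarType::Byte: return ScalarType::QUInt8;
    case ScalarType::Char: return ScalarType::QInt8;
    case ScalarType::Int:  return ScalarType::QInt32;
    default:               return t;  // other types map to themselves
  }
}

ScalarType toUnderlying(ScalarType t) {
  switch (t) {
    case ScalarType::QUInt8: return ScalarType::Byte;
    case ScalarType::QInt8:  return ScalarType::Char;
    case ScalarType::QInt32: return ScalarType::Int;
    default:                 return t;
  }
}

int main() {
  // isUnderlying(type, qtype) in the diff reduces to this round-trip property:
  for (ScalarType t : {ScalarType::Byte, ScalarType::Char, ScalarType::Int}) {
    assert(toUnderlying(toQIntType(t)) == t);
  }
  return 0;
}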
return Stream(UNSAFE, Device(device_type, device_index), stream_id); diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 41af810b1ef1..84fcb466133b 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -113,6 +113,26 @@ bool TensorImpl::has_storage() const { return storage_; } +bool TensorImpl::is_contiguous(at::MemoryFormat memory_format) const { +#ifdef DEBUG + AT_ASSERT(compute_contiguous() == is_contiguous_); +#endif + if (memory_format == at::MemoryFormat::ChannelsLast) { + if (dim() == 4) { + auto strides_1 = 1; + auto strides_3 = sizes_[1]; + auto strides_2 = strides_3 * sizes_[3]; + auto strides_0 = strides_2 * sizes_[2]; + if (strides_0 == strides_[0] && strides_1 == strides_[1] && + strides_2 == strides_[2] && strides_3 == strides_[3]) { + return true; + } + } + return false; + } + return is_contiguous_; +} + const Storage& TensorImpl::storage() const { return storage_; } @@ -135,4 +155,52 @@ at::DataPtr PlacementDeleteContext::makeDataPtr( AutogradMetaInterface::~AutogradMetaInterface() {} +/// NOTE [ Treating Variables as non-Variables in type dispatch ] +/// +/// Previously, in VariableType_*.cpp (generated by gen_variable_type.py), when +/// a function is using the 'use_derived' strategy, we call its implementation +/// on the base non-Variable type (`baseType`), passing unwrapped tensors to the +/// call so that any `.dispatch_type()` calls in the implementation can treat the passed +/// tensors as non-Variables and won't dispatch back to functions in VariableType. +/// +/// However, after the Variable/Tensor merge, there is no concept of unwrapping +/// a tensor anymore, and directly passing variables to the base type calls will +/// cause the `.dispatch_type()` dispatch in the implementation to treat the tensor as a +/// variable, and any function dispatch based on `.dispatch_type()` will dispatch back to +/// VariableType, which is not what we want. +/// +/// The solution to the above problem is to add `at::NonVariableTypeMode`, which +/// when enabled will cause `legacyTensorType()` and `getType()` to always return +/// non-Variable type, even if the tensor being called on is a variable. +/// +/// TODO: Since `torch::NoGradGuard` serves the same purpose in libtorch, we should +/// merge these two thread-local guards. + +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. In that case, we don't provide +/// `at::NonVariableTypeMode`. 
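The NonVariableTypeMode flag described in the note above is exposed only through the static is_enabled()/set_enabled() pair implemented just below. A minimal sketch of how a caller might scope the thread-local flag follows; the RAII guard is hypothetical and not part of this patch, only c10::NonVariableTypeMode itself is.

    #include <c10/core/TensorImpl.h>  // declares c10::NonVariableTypeMode in this diff

    // Hypothetical RAII helper (not in this patch): enables the thread-local
    // flag for the duration of a scope and restores the previous value after.
    struct NonVariableTypeModeGuard {
      NonVariableTypeModeGuard()
          : prev_(c10::NonVariableTypeMode::is_enabled()) {
        c10::NonVariableTypeMode::set_enabled(true);
      }
      ~NonVariableTypeModeGuard() {
        c10::NonVariableTypeMode::set_enabled(prev_);
      }
      bool prev_;
    };

    void example() {
      NonVariableTypeModeGuard guard;
      // Inside this scope, TensorImpl::is_variable() reports false even for
      // tensors that carry AutogradMeta, so type dispatch treats them as
      // plain (non-Variable) tensors, per the note above.
    }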
+#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + +thread_local bool NonVariableTypeMode_enabled = false; + +bool NonVariableTypeMode::is_enabled() { + return NonVariableTypeMode_enabled; +} + +void NonVariableTypeMode::set_enabled(bool enabled) { + NonVariableTypeMode_enabled = enabled; +} + +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + +bool NonVariableTypeMode::is_enabled() { + throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); +} + +void NonVariableTypeMode::set_enabled(bool enabled) { + throw std::runtime_error("NonVariableTypeMode is not supported on mobile"); +} + +#endif + } // namespace c10 diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 907834c07404..7a4afeef8e2a 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -61,7 +62,7 @@ inline int64_t size_from_dim_(int k, IntArrayRef dims) { // Product of all dims up to k (not including dims[k]) inline int64_t size_to_dim_(int k, IntArrayRef dims) { - AT_ASSERT((unsigned)k <= dims.size()); + TORCH_CHECK((unsigned)k <= dims.size()); int64_t r = 1; for (int i = 0; i < k; ++i) { r *= dims[i]; @@ -71,7 +72,7 @@ inline int64_t size_to_dim_(int k, IntArrayRef dims) { // Product of all dims between k and l (not including dims[k] and dims[l]) inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { - AT_ASSERT((unsigned)l < dims.size()); + TORCH_CHECK((unsigned)l < dims.size()); int64_t r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { @@ -87,8 +88,8 @@ inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { // Wrap around axis_index if it is negative, s.t., -1 is the last dim inline int canonical_axis_index_(int axis_index, int ndims) { - AT_ASSERT(axis_index >= -ndims); - AT_ASSERT(axis_index < ndims); + TORCH_CHECK(axis_index >= -ndims); + TORCH_CHECK(axis_index < ndims); if (axis_index < 0) { return axis_index + ndims; } @@ -138,6 +139,11 @@ struct C10_API AutogradMetaInterface { virtual ~AutogradMetaInterface(); }; +struct C10_API NonVariableTypeMode { + static bool is_enabled(); + static void set_enabled(bool enabled); +}; + // NOTE [ Version Counter Sharing ] // // Every Tensor has a version counter. Version counters are incremented whenever the @@ -370,7 +376,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ virtual int64_t numel() const { #ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); + TORCH_INTERNAL_ASSERT(compute_numel() == numel_); #endif return numel_; } @@ -382,12 +388,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * compute_contiguous() for the exact definition of whether or not * a tensor is contiguous or not. */ - virtual bool is_contiguous() const { -#ifdef DEBUG - AT_ASSERT(compute_contiguous() == is_contiguous_); -#endif - return is_contiguous_; - } + virtual bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const; bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance reasons. 
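The ChannelsLast branch behind the new is_contiguous(at::MemoryFormat) overload declared above (implemented in the TensorImpl.cpp hunk earlier in this diff) only accepts 4-d tensors whose strides match the NHWC pattern derived from their sizes. A standalone re-derivation of that stride computation, for illustration only:

    #include <array>
    #include <cstdint>
    #include <iostream>

    // For a 4-d tensor with sizes (N, C, H, W), the channels_last layout keeps
    // C innermost: stride[1] = 1, stride[3] = C, stride[2] = C * W,
    // stride[0] = C * W * H.  This mirrors the check added in TensorImpl.cpp.
    std::array<int64_t, 4> channels_last_strides(const std::array<int64_t, 4>& sizes) {
      std::array<int64_t, 4> strides{};
      strides[1] = 1;
      strides[3] = sizes[1];               // C
      strides[2] = strides[3] * sizes[3];  // C * W
      strides[0] = strides[2] * sizes[2];  // C * W * H
      return strides;
    }

    int main() {
      // N=2, C=3, H=4, W=5  ->  strides (60, 1, 15, 3)
      auto s = channels_last_strides({2, 3, 4, 5});
      std::cout << s[0] << " " << s[1] << " " << s[2] << " " << s[3] << "\n";
    }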
@@ -426,25 +427,21 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } int64_t get_device() const { - if (device_opt_.has_value()) { - // See NOTE [c10::optional operator usage in CUDA] - return (*device_opt_).index(); - } - - AT_ERROR( + TORCH_CHECK( + device_opt_.has_value(), "tensor with backend ", toString(tensorTypeIdToBackend(type_id())), " does not have a device"); + // See NOTE [c10::optional operator usage in CUDA] + return (*device_opt_).index(); } Device device() const { - if (device_opt_.has_value()) { - // See NOTE [c10::optional operator usage in CUDA] - return *device_opt_; - } - - AT_ERROR( + TORCH_CHECK( + device_opt_.has_value(), "tensor with backend ", toString(tensorTypeIdToBackend(type_id())), " does not have a device"); + // See NOTE [c10::optional operator usage in CUDA] + return *device_opt_; } Layout layout() const { @@ -501,7 +498,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ bool is_wrapped_number() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged return is_wrapped_number_; } @@ -514,8 +511,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ void set_wrapped_number(bool value) { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_ASSERT(dim() == 0); + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(dim() == 0); is_wrapped_number_ = value; } @@ -563,11 +560,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [Tensor versus Variable in C++]. */ void set_requires_grad(bool requires_grad) { - if (autograd_meta()) { - autograd_meta()->set_requires_grad(requires_grad, this); - } else { - AT_ERROR("set_requires_grad is not implemented for Tensor"); - } + TORCH_INTERNAL_ASSERT(autograd_meta(), "set_requires_grad is not implemented for Tensor"); + autograd_meta()->set_requires_grad(requires_grad, this); } /** @@ -581,11 +575,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [Tensor versus Variable in C++]. */ bool requires_grad() const { - if (autograd_meta()) { - return autograd_meta()->requires_grad(); - } else { - AT_ERROR("requires_grad is not implemented for Tensor"); - } + TORCH_INTERNAL_ASSERT(autograd_meta(), "requires_grad is not implemented for Tensor"); + return autograd_meta()->requires_grad(); } /** @@ -624,15 +615,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ template inline T * data() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_CHECK(has_storage(), + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(has_storage(), "Cannot access data pointer of Tensor that doesn't have storage"); - AT_ASSERTM( + TORCH_CHECK( storage_initialized(), "The tensor has a non-zero number of elements, but its data is not allocated yet. 
" "Caffe2 uses a lazy allocation, so you will need to call " "mutable_data() or raw_mutable_data() to actually allocate memory."); - AT_ASSERTM( + TORCH_CHECK( storage_.IsType(), "Tensor type mismatch, caller expects elements to be ", caffe2::TypeMeta::TypeName(), @@ -657,10 +648,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ inline void* data() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_CHECK(has_storage(), + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(has_storage(), "Cannot access data pointer of Tensor that doesn't have storage"); - AT_ASSERT(dtype_initialized()); + TORCH_CHECK(dtype_initialized(), + "Cannot access data pointer of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); return static_cast( static_cast(storage_.data()) + data_type_.itemsize() * storage_offset_); @@ -699,7 +692,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Return the size of a single element of this tensor in bytes. */ size_t itemsize() const { - AT_ASSERT(dtype_initialized()); + TORCH_CHECK(dtype_initialized(), + "Cannot report itemsize of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); return data_type_.itemsize(); } @@ -735,7 +730,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * which is harder to misuse. */ virtual void resize_dim(int64_t ndim) { - AT_CHECK(allow_tensor_metadata_change(), "resize_dim is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "resize_dim is not allowed on Tensor created from .data or .detach()"); sizes_.resize(ndim, 0); strides_.resize(ndim, 0); refresh_numel(); @@ -751,7 +746,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * which is harder to misuse. */ virtual void set_size(int64_t dim, int64_t new_size) { - AT_CHECK(allow_tensor_metadata_change(), "set_size is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_size is not allowed on Tensor created from .data or .detach()"); sizes_.at(dim) = new_size; refresh_numel(); refresh_contiguous(); @@ -764,7 +759,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * which is harder to misuse. */ virtual void set_stride(int64_t dim, int64_t new_stride) { - AT_CHECK(allow_tensor_metadata_change(), "set_stride is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_stride is not allowed on Tensor created from .data or .detach()"); strides_[dim] = new_stride; refresh_numel(); refresh_contiguous(); @@ -778,7 +773,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * (and resizing if necessary.) 
*/ virtual void set_storage_offset(int64_t storage_offset) { - AT_CHECK(allow_tensor_metadata_change(), "set_storage_offset is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_storage_offset is not allowed on Tensor created from .data or .detach()"); storage_offset_ = storage_offset; } @@ -793,8 +788,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ void set_sizes_contiguous(IntArrayRef new_size) { - AT_CHECK(allow_tensor_metadata_change(), "set_sizes_contiguous is not allowed on Tensor created from .data or .detach()"); - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(allow_tensor_metadata_change(), "set_sizes_contiguous is not allowed on Tensor created from .data or .detach()"); + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged auto old_dim = sizes_.size(); auto new_dim = new_size.size(); @@ -818,9 +813,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * See Note [We regret making Variable hold a Tensor] */ void set_sizes_and_strides(IntArrayRef new_size, IntArrayRef new_stride) { - AT_CHECK(allow_tensor_metadata_change(), "set_sizes_and_strides is not allowed on Tensor created from .data or .detach()"); - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_CHECK( + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_CHECK(allow_tensor_metadata_change(), "set_sizes_and_strides is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK( new_size.size() == new_stride.size(), "dimensionality of sizes (", new_size.size(), @@ -871,10 +866,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { /** * True if a tensor is a variable. See Note [Tensor versus Variable in C++] */ - bool is_variable() const { return autograd_meta_ != nullptr; }; + bool is_variable() const { + return autograd_meta_ != nullptr && !at::NonVariableTypeMode::is_enabled(); + } /** * Set whether a tensor allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + * See NOTE [ Metadata Change for a Detached Tensor ] for details. */ virtual void set_allow_tensor_metadata_change(bool value) { allow_tensor_metadata_change_ = value; @@ -882,6 +880,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { /** * True if a tensor allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + * See NOTE [ Metadata Change for a Detached Tensor ] for details. */ virtual bool allow_tensor_metadata_change() const { return allow_tensor_metadata_change_; @@ -910,16 +909,16 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // NOTE: `shallow_copy_and_detach()` does not copy the following TensorImpl fields: // 1. the AutogradMeta pointer, because it is unique for each Variable. - // 2. the version counter, because although it lives in TensorImpl, the version counter is managed - // by autograd, and the call sites of `shallow_copy_and_detach()` (from autograd) should decide what - // the version counter should be for each new TensorImpl. See NOTE [ Version Counter Sharing ] for details. + // 2. the version counter, because it is set to the passed in `version_counter`. + // See NOTE [ Version Counter Sharing ] for details. 
// - // NOTE: We don't set `allow_tensor_metadata_change_` to false here, because there are call sites - // to this function that need to change the shallow copy's size or storage afterwards, and setting - // `allow_tensor_metadata_change_` to false would prevent those changes from happening and is - // undesirable. - virtual c10::intrusive_ptr shallow_copy_and_detach() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + // NOTE: `allow_tensor_metadata_change` determines whether the TensorImpl shallow-copy + // allows changes to its metadata (e.g. sizes / strides / storage / storage_offset). + // See NOTE [ Metadata Change for a Detached Tensor ] for details. + virtual c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const { + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged auto impl = c10::make_intrusive(Storage(storage()), type_id()); impl->set_sizes_and_strides(sizes(), strides()); impl->storage_offset_ = storage_offset_; @@ -927,6 +926,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { impl->reserved_ = reserved_; impl->refresh_numel(); impl->refresh_contiguous(); + impl->set_version_counter(version_counter); + impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); return impl; } @@ -965,8 +966,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The device type of a Tensor, e.g., DeviceType::CPU or DeviceType::CUDA. */ DeviceType device_type() const { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged - AT_ASSERT(device_opt_.has_value()); + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + // TODO: A useful internal assert would be to show that device_opt_ is null + // only if you are an undefined tensor + TORCH_CHECK(device_opt_.has_value(), "device_type cannot be run on undefined Tensor"); // See NOTE [c10::optional operator usage in CUDA] return (*device_opt_).type(); } @@ -983,9 +986,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * This op is auto-asynchronous if the underlying device (CUDA) supports it. */ void Extend(int64_t num, float growthPct) { - AT_ASSERT(sizes_.size() >= 1u); - AT_ASSERTM(num >= 0, "`num` must be non-negative for Extend"); - AT_ASSERTM( + TORCH_CHECK(sizes_.size() >= 1u); + TORCH_CHECK(num >= 0, "`num` must be non-negative for Extend"); + TORCH_CHECK( is_contiguous_, "Right now Extend is only supported for contiguous Tensor."); auto newDims = sizes_; @@ -1013,7 +1016,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Resize(newCapacity); auto* newData = raw_mutable_data(data_type_); if (data_type_.copy()) { - AT_ASSERTM( + TORCH_CHECK( device_type() == DeviceType::CPU, "non-POD types work only on CPU"); data_type_.copy()(oldData.get(), newData, oldSize); @@ -1048,10 +1051,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ template void ReserveSpace(const T& outer_dim) { - AT_ASSERTM( + TORCH_CHECK( is_contiguous_, "Right now ReserveSpace is only supported for contiguous Tensor."); - AT_ASSERTM( + TORCH_CHECK( storage_.unique(), "Can't call ReserveSpace on shared storage."); auto newCapacity = sizes_; newCapacity[0] = outer_dim; @@ -1122,15 +1125,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * This requires the total size of the tensor to remains constant. 
*/ inline void Reshape(const std::vector& dims) { - AT_ASSERTM( + TORCH_CHECK( is_contiguous_, "Right now Reshape is only supported for contiguous Tensor."); int64_t new_size = 1; for (auto d : dims) { - AT_ASSERT(d >= 0); + TORCH_CHECK(d >= 0); new_size *= d; } - AT_ASSERTM( + TORCH_CHECK( new_size == numel_, "New size and old size are not equal. You cannot use Reshape, " "but should use Resize." @@ -1172,20 +1175,20 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Right now, we are assuming the device_type are the same, since it is // inherently the same in the non-templatized code. We should probably add // an assert here which might affect perf a little bit. - AT_ASSERTM( + TORCH_CHECK( src.numel_ == numel_, "Size mismatch - did you call reshape before sharing the data?"); // It is possible that the source tensor hasn't called mutable_data() yet, // in which case ShareData() doesn't make much sense since we don't really // know what to share yet. // TODO: Add the assert after all uninitialized states are eliminated - // AT_ASSERTM(src.dtype_initialized(), + // TORCH_CHECK(src.dtype_initialized(), // "Source tensor don't have a data type (did you call mutable_data on the tensor?)"); if (!src.dtype_initialized()) { C10_LOG_EVERY_MS(WARNING, 1000) << "Source tensor don't have a data type (did you call mutable_data on the tensor?)"; } - AT_ASSERTM( + TORCH_CHECK( src.storage_initialized(), "Source tensor has no content and has size > 0"); // Finally, do sharing. @@ -1202,7 +1205,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { DataPtr&& data_ptr, const caffe2::TypeMeta& data_type, size_t capacity) { - AT_ASSERTM( + TORCH_CHECK( data_type.id() != caffe2::TypeIdentifier::uninitialized(), "To share with a raw external pointer you need to pass in an " "initialized data_type(TypeMeta)."); @@ -1264,7 +1267,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { if (numel_ == 0 || (meta.placementNew() == nullptr && !had_special_dtor && storage_.numel() >= numel_)) { - AT_ASSERT(storage_offset_ == 0); // because we just reallocated + TORCH_INTERNAL_ASSERT(storage_offset_ == 0); // because we just reallocated return storage_.data(); } const Allocator* allocator = storage_.allocator(); @@ -1291,7 +1294,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { allocator->allocate(numel_ * storage_.itemsize())); } storage_.set_numel(numel_); - AT_ASSERT(storage_offset_ == 0); // because we just reallocated + TORCH_INTERNAL_ASSERT(storage_offset_ == 0); // because we just reallocated device_opt_ = storage_.device(); return storage_.data(); } @@ -1321,7 +1324,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * storage UNINITIALIZED after a Resize() or FreeMemory() */ bool storage_initialized() const { - AT_ASSERT(has_storage()); + TORCH_CHECK(has_storage(), "cannot call storage_initialized on tensor that does not have storage"); return storage_.data() || numel_ == 0; } @@ -1335,7 +1338,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } void set_storage(at::Storage storage) { - AT_CHECK(allow_tensor_metadata_change(), "set_storage is not allowed on Tensor created from .data or .detach()"); + TORCH_CHECK(allow_tensor_metadata_change(), "set_storage is not allowed on Tensor created from .data or .detach()"); storage_ = std::move(storage); data_type_ = storage_.dtype(); device_opt_ = storage_.device(); @@ -1435,7 +1438,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Recompute the cached 
numel of a tensor. Call this if you modify sizes. */ void refresh_numel() { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged numel_ = compute_numel(); } @@ -1444,7 +1447,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * or strides. */ void refresh_contiguous() { - AT_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged + TORCH_INTERNAL_ASSERT(!is_variable()); // TODO: remove this when Variable and Tensor are merged is_contiguous_ = compute_contiguous(); } @@ -1487,6 +1490,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // INVARIANT: When storage is non-null, this Device must // agree with the type meta in storage. + // + // INVARIANT: device_opt_ is only nullopt for undefined tensors + // (which do not have a device.) c10::optional device_opt_; // You get to have eight byte-size fields here, before you @@ -1495,15 +1501,21 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_contiguous_ = true; bool is_wrapped_number_ = false; - // Previously, if we change the tensor metadata (e.g. sizes / strides / storage / storage_offset) - // of a derived tensor (i.e. tensors created from Python `tensor.data` or Python/C++ `tensor.detach()`), - // those metadata in the original tensor will also be updated. However, the new behavior is that - // those metadata changes to a derived tensor will not update the original tensor anymore, and we - // need this flag to make such changes explicitly illegal, to prevent users from changing metadata of - // the derived tensor and expecting the original tensor to also be updated. + // NOTE [ Metadata Change for a Detached Tensor ] + // + // Normally, a user is allowed to change the tensor metadata + // (e.g. sizes / strides / storage / storage_offset) of a tensor. + // However, if the tensor is created by `t1_detached = t1.data` in Python + // or `t1_detached = t1.detach()` in Python/C++, those changes to the + // tensor metadata of `t1_detached` will not be propagated back to the + // original tensor `t1`. In order to make such changes explicitly illegal, + // we created the `allow_tensor_metadata_change_` flag, to prevent users + // from changing metadata of the detached tensor and expecting the original + // tensor to also be updated. // - // NOTE: For a full list of tensor metadata fields, please see `shallow_copy_and_detach()` in TensorImpl - // and its subclasses to find which fields are copied by value. + // NOTE: For a full list of tensor metadata fields, please see + // `shallow_copy_and_detach()` in TensorImpl and its subclasses to find + // which fields are copied by value. 
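Concretely, the allow_tensor_metadata_change_ flag documented in the NOTE above is what the TORCH_CHECKs added to set_size / set_stride / set_storage_offset / set_storage earlier in this file consult, and it is now settable per detached copy through the extra shallow_copy_and_detach parameter. A minimal sketch of the resulting behavior at the TensorImpl level (the dimension and size values are illustrative only):

    #include <c10/core/TensorImpl.h>
    #include <c10/util/Exception.h>

    // Sketch: once a TensorImpl is marked as not allowing metadata changes
    // (as a detached copy would be, depending on the flag passed to
    // shallow_copy_and_detach), the metadata setters refuse to run.
    void sketch(c10::TensorImpl* impl) {
      impl->set_allow_tensor_metadata_change(false);
      try {
        impl->set_size(/*dim=*/0, /*new_size=*/4);  // hits TORCH_CHECK(allow_tensor_metadata_change(), ...)
      } catch (const c10::Error& e) {
        // "set_size is not allowed on Tensor created from .data or .detach()"
      }
    }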
bool allow_tensor_metadata_change_ = true; // we decide to keep reserved_ and it will diff --git a/c10/core/TensorTypeIdRegistration.h b/c10/core/TensorTypeIdRegistration.h index 231f86f61842..0c51623b1f57 100644 --- a/c10/core/TensorTypeIdRegistration.h +++ b/c10/core/TensorTypeIdRegistration.h @@ -87,13 +87,13 @@ inline c10::TensorTypeId TensorTypeIdRegistrar::id() const noexcept { return id_; } -#define C10_DECLARE_TENSOR_TYPE(TensorName) \ +#define C10_DECLARE_TENSOR_TYPE(TensorName) \ C10_API ::c10::TensorTypeId TensorName() -#define C10_DEFINE_TENSOR_TYPE(TensorName) \ - ::c10::TensorTypeId TensorName() { \ +#define C10_DEFINE_TENSOR_TYPE(TensorName) \ + C10_EXPORT ::c10::TensorTypeId TensorName() { \ static ::c10::TensorTypeIdRegistrar registration_raii; \ - return registration_raii.id(); \ + return registration_raii.id(); \ } C10_DECLARE_TENSOR_TYPE(UndefinedTensorId); diff --git a/c10/core/thread_pool.cpp b/c10/core/thread_pool.cpp index cc13566e29ee..1529f74bd4a9 100644 --- a/c10/core/thread_pool.cpp +++ b/c10/core/thread_pool.cpp @@ -2,8 +2,8 @@ namespace c10 { -ThreadPool::ThreadPool(std::size_t pool_size, int numa_node_id) - : threads_(pool_size), +ThreadPool::ThreadPool(int pool_size, int numa_node_id) + : threads_(pool_size < 0 ? defaultNumThreads() : pool_size), running_(true), complete_(true), available_(threads_.size()), @@ -48,6 +48,9 @@ bool ThreadPool::inThreadPool() const { } void ThreadPool::run(const std::function& func) { + if (threads_.size() == 0) { + throw std::runtime_error("No threads to run a task"); + } std::unique_lock lock(mutex_); // Set task and signal condition variable so that a worker thread will @@ -120,20 +123,6 @@ void ThreadPool::main_loop(std::size_t index) { } // while running_ } -// constexpr initialization guaranteed to be before any static initialization -std::atomic num_threads{1}; -void setNumThreads(size_t v) { - if(-1 == num_threads.exchange(v)) { - throw std::runtime_error("Error: cannot set num threads after pool has started"); - } -} - -TaskThreadPoolBase& global_work_queue() { - static std::shared_ptr pool = - ThreadPoolRegistry()->Create("C10", 0, num_threads.exchange(-1), false); - return *pool; -} - C10_DEFINE_SHARED_REGISTRY( ThreadPoolRegistry, TaskThreadPoolBase, diff --git a/c10/core/thread_pool.h b/c10/core/thread_pool.h index b4a716ac5b6a..5fe8b416c6f9 100644 --- a/c10/core/thread_pool.h +++ b/c10/core/thread_pool.h @@ -36,6 +36,10 @@ class C10_API TaskThreadPoolBase { virtual bool inThreadPool() const = 0; virtual ~TaskThreadPoolBase() noexcept {} + + static size_t defaultNumThreads() { + return std::thread::hardware_concurrency(); + } }; class C10_API ThreadPool : public c10::TaskThreadPoolBase { @@ -66,7 +70,7 @@ class C10_API ThreadPool : public c10::TaskThreadPoolBase { ThreadPool() = delete; explicit ThreadPool( - std::size_t pool_size, + int pool_size, int numa_node_id = -1); ~ThreadPool(); @@ -102,10 +106,6 @@ class C10_API ThreadPool : public c10::TaskThreadPoolBase { void main_loop(std::size_t index); }; -C10_API void setNumThreads(size_t v); - -C10_API TaskThreadPoolBase& global_work_queue(); - class C10_API TaskThreadPool : public c10::ThreadPool { public: explicit TaskThreadPool( diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index f03ba432f26d..e5c5552ab1a4 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -338,7 +338,7 @@ struct THCCachingAllocator std::lock_guard lock(mutex); Block* block = find_allocated_block(ptr); if (!block) { - 
AT_ERROR("invalid device pointer: %p", ptr); + AT_ERROR("invalid device pointer: ", ptr); } while (block->prev) { block = block->prev; @@ -378,17 +378,21 @@ struct THCCachingAllocator void recordStream(void* ptr, cuda::CUDAStream stream) { - std::lock_guard lock(mutex); - Block* block = find_allocated_block(ptr); - if (!block) { - AT_ERROR("invalid device pointer: %p", ptr); - } - if (stream.stream() == block->stream) { - // ignore uses on the allocation stream, since those don't require any - // special synchronization - return; + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (ptr) { + std::lock_guard lock(mutex); + Block* block = find_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + block->stream_uses.insert(stream); } - block->stream_uses.insert(stream); } /** moves a block into a pool of cached free blocks */ diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 7f7f8640ae17..b23f8aa1c65f 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -68,7 +68,7 @@ class C10_CUDA_API CUDAStream { /// Construct a CUDAStream from a Stream. This construction is checked, /// and will raise an error if the Stream is not, in fact, a CUDA stream. explicit CUDAStream(Stream stream) : stream_(stream) { - AT_CHECK(stream_.device_type() == DeviceType::CUDA); + TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); } /// Construct a CUDAStream from a Stream with no error checking. diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 879a276b6db4..09e4b46fcfee 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -35,19 +35,41 @@ #define MACRO_EXPAND(args) args /// C10_NODISCARD - Warn if a type or return value is discarded. + +// Technically, we should check if __cplusplus > 201402L here, because +// [[nodiscard]] is only defined in C++17. However, some compilers +// we care about don't advertise being C++17 (e.g., clang), but +// support the attribute anyway. In fact, this is not just a good idea, +// it's the law: clang::warn_unused_result doesn't work on nvcc + clang +// and the best workaround for this case is to use [[nodiscard]] +// instead; see https://github.com/pytorch/pytorch/issues/13118 +// +// Note to future editors: if you have noticed that a compiler is +// misbehaving (e.g., it advertises support, but the support doesn't +// actually work, or it is emitting warnings). Some compilers which +// are strict about the matter include MSVC, which will complain: +// +// error C2429: attribute 'nodiscard' requires compiler flag '/std:c++latest' +// +// Exhibits: +// - MSVC 19.14: https://godbolt.org/z/Dzd7gn (requires /std:c++latest) +// - Clang 8.0.0: https://godbolt.org/z/3PYL4Z (always advertises support) +// - gcc 8.3: https://godbolt.org/z/4tLMQS (always advertises support) #define C10_NODISCARD -#if __cplusplus > 201402L && defined(__has_cpp_attribute) -#if __has_cpp_attribute(nodiscard) -#undef C10_NODISCARD -#define C10_NODISCARD [[nodiscard]] -#endif +#if defined(__has_cpp_attribute) +# if __has_cpp_attribute(nodiscard) +# undef C10_NODISCARD +# define C10_NODISCARD [[nodiscard]] +# endif // Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious // error when __has_cpp_attribute is given a scoped attribute in C mode. 
#elif __cplusplus && defined(__has_cpp_attribute) -#if __has_cpp_attribute(clang::warn_unused_result) -#undef C10_NODISCARD -#define C10_NODISCARD [[clang::warn_unused_result]] -#endif +# if __has_cpp_attribute(clang::warn_unused_result) +// TODO: It's possible this is still triggering https://github.com/pytorch/pytorch/issues/13118 +// on Windows; if it is, better fix it. +# undef C10_NODISCARD +# define C10_NODISCARD [[clang::warn_unused_result]] +# endif #endif // suppress an unused variable. diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 989e12143935..2f376d46b9b0 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -145,13 +145,13 @@ class ArrayRef final { /// front - Get the first element. AT_CPP14_CONSTEXPR const T& front() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + TORCH_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); return Data[0]; } /// back - Get the last element. AT_CPP14_CONSTEXPR const T& back() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); return Data[Length - 1]; } @@ -163,7 +163,7 @@ class ArrayRef final { /// slice(n, m) - Chop off the first N elements of the array, and keep M /// elements in the array. AT_CPP14_CONSTEXPR ArrayRef slice(size_t N, size_t M) const { - AT_CHECK( + TORCH_CHECK( N + M <= size(), "ArrayRef: invalid slice, N = ", N, @@ -188,7 +188,7 @@ class ArrayRef final { /// Vector compatibility AT_CPP14_CONSTEXPR const T& at(size_t Index) const { - AT_CHECK( + TORCH_CHECK( Index < Length, "ArrayRef: invalid index Index = ", Index, diff --git a/c10/util/Deprecated.h b/c10/util/Deprecated.h index b9bcf22eb207..59acf78d6d21 100644 --- a/c10/util/Deprecated.h +++ b/c10/util/Deprecated.h @@ -39,36 +39,45 @@ // Sample usage: // // using BadType C10_DEPRECATED_USING = int; -// -#if defined(__cplusplus) && __cplusplus >= 201402L +// technically [[deprecated]] syntax is from c++14 standard, but it works in +// many compilers. +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(deprecated) # define C10_DEPRECATED_USING [[deprecated]] -#elif defined(_MSC_VER) && defined(__CUDACC__) -// Apparently, [[deprecated]] doesn't work on nvcc on Windows; +#endif +#endif + +#if !defined(C10_DEPRECATED_USING) && defined(_MSC_VER) +#if defined(__CUDACC__) +// [[deprecated]] doesn't work on nvcc on Windows; // you get the error: // // error: attribute does not apply to any entity // // So we just turn the macro off in this case. # define C10_DEPRECATED_USING -#elif defined(_MSC_VER) -// __declspec(deprecated) does not work in using declarations: -// https://godbolt.org/z/lOwe1h -// but it seems that most of C++14 is available in MSVC even if you don't ask for -// it. (It's also harmless to specify an attribute because it is C++11 supported -// syntax; you mostly risk it not being understood). Some more notes at -// https://blogs.msdn.microsoft.com/vcblog/2016/06/07/standards-version-switches-in-the-compiler/ +#else +// [[deprecated]] does work in windows without nvcc, though msc doesn't support +// `__has_cpp_attribute`. # define C10_DEPRECATED_USING [[deprecated]] -#elif defined(__CUDACC__) +#endif +#endif + +#if !defined(C10_DEPRECATED_USING) && defined(__GNUC__) // nvcc has a bug where it doesn't understand __attribute__((deprecated)) -// declarations even when the host compiler supports it. 
It's OK -// with [[deprecated]] though (although, if you are on an old version -// of gcc which doesn't understand attributes, you'll get a -Wattributes -// error that it is ignored -# define C10_DEPRECATED_USING [[deprecated]] -#elif defined(__GNUC__) +// declarations even when the host compiler supports it. We'll only use this gcc +// attribute when not cuda, and when using a GCC compiler that doesn't support +// the c++14 syntax we checked for above (availble in __GNUC__ >= 5) +#if !defined(__CUDACC__) # define C10_DEPRECATED_USING __attribute__((deprecated)) #else +// using cuda + gcc < 5, neither deprecated syntax is available so turning off. +# define C10_DEPRECATED_USING +#endif +#endif + +#if ! defined(C10_DEPRECATED_USING) # warning "You need to implement C10_DEPRECATED_USING for this compiler" # define C10_DEPRECATED_USING #endif diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 8c4fbed94008..6d270d944a54 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -3,6 +3,7 @@ #include "c10/macros/Macros.h" #include "c10/util/StringUtil.h" +#include "c10/util/Deprecated.h" #include #include @@ -19,8 +20,8 @@ namespace c10 { /// The primary ATen error class. /// Provides a complete error message with source location information via -/// `what()`, and a more concise message via `what_without_backtrace()`. Should -/// primarily be used with the `AT_ERROR` macro. +/// `what()`, and a more concise message via `what_without_backtrace()`. +/// Don't throw this directly; use TORCH_CHECK/TORCH_INTERNAL_ASSERT instead. /// /// NB: c10::Error is handled specially by the default torch to suppress the /// backtrace, see torch/csrc/Exceptions.h @@ -102,7 +103,8 @@ class C10_API Warning { }; // Used in ATen for out-of-bound indices that can reasonably only be detected -// lazily inside a kernel (See: advanced indexing). +// lazily inside a kernel (See: advanced indexing). These turn into +// IndexError when they cross to Python. class C10_API IndexError : public Error { using Error::Error; }; @@ -112,58 +114,260 @@ class C10_API IndexError : public Error { // exception type before its what() content C10_API std::string GetExceptionString(const std::exception& e); -} // namespace c10 +namespace detail { + +// Return x if it is non-empty; otherwise return y. +inline std::string if_empty_then(std::string x, std::string y) { + if (x.empty()) { + return y; + } else { + return x; + } +} -// TODO: variants that print the expression tested and thus don't require -// strings -// TODO: CAFFE_ENFORCE_WITH_CALLER style macro +} -// TODO: move AT_ERROR to C10_ERROR -// TODO: consolidate the enforce and assert messages. Assert is a bit confusing -// as c++ assert quits, while this throws. -// TODO: merge AT_CHECK with AT_ASSERTM. CHECK in fbcode means strict failure if -// not met. -// In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t), +} // namespace c10 + +// Private helper macro for implementing TORCH_INTERNAL_ASSERT and TORCH_CHECK +// +// Note: In the debug build With MSVC, __LINE__ might be of long type (a.k.a int32_t), // which is different from the definition of `SourceLocation` that requires // unsigned int (a.k.a uint32_t) and may cause a compile error with the message: // error C2397: conversion from 'long' to 'uint32_t' requires a narrowing conversion // Here the static cast is used to pass the build. +#define C10_THROW_ERROR(err_type, msg) \ + throw ::c10::err_type({__func__, __FILE__, static_cast(__LINE__)}, msg) -#define AT_ERROR(...) 
\ - throw ::c10::Error({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) +// Private helper macro for workaround MSVC misexpansion of nested macro +// invocations involving __VA_ARGS__. See +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define C10_EXPAND_MSVC_WORKAROUND(x) x -#define AT_INDEX_ERROR(...) \ - throw ::c10::IndexError({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) -#define AT_WARN(...) \ - ::c10::Warning::warn({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) +// ---------------------------------------------------------------------------- +// Error reporting macros +// ---------------------------------------------------------------------------- -#define AT_ASSERT(cond) \ +// A utility macro to provide assert()-like functionality; that is, enforcement +// of internal invariants in code. It supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the assert +// failure message using operator<< (this is useful to print some variables +// which may be useful for debugging.) +// +// Usage: +// TORCH_INTERNAL_ASSERT(should_be_true); +// TORCH_INTERNAL_ASSERT(x == 0, "x = ", x); +// +// Assuming no bugs in PyTorch, the conditions tested by this macro should +// always be true; e.g., it should be possible to disable all of these +// conditions without changing observable user behavior. If you would like to +// do error reporting for user input, please use TORCH_CHECK instead. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike assert()). +// +#ifdef C10_MOBILE +#define TORCH_INTERNAL_ASSERT(cond, ...) \ if (!(cond)) { \ - AT_ERROR( \ - #cond " ASSERT FAILED at ", \ - __FILE__, \ - ":", \ - __LINE__, \ - ", please report a bug to PyTorch."); \ + C10_THROW_ERROR(Error, \ + #cond " INTERNAL ASSERT FAILED at" \ + __FILE__ \ + ); \ } - -#define AT_ASSERTM(cond, ...) \ +#else +#define TORCH_INTERNAL_ASSERT(cond, ...) \ if (!(cond)) { \ - AT_ERROR(::c10::str( \ - #cond, \ - " ASSERT FAILED at ", \ + C10_THROW_ERROR(Error, ::c10::str( \ + #cond " INTERNAL ASSERT FAILED at ", \ __FILE__, \ ":", \ __LINE__, \ ", please report a bug to PyTorch. ", \ - __VA_ARGS__)); \ + ::c10::str(__VA_ARGS__) \ + )); \ } +#endif + +// A utility macro to make it easier to test for error conditions from user +// input. Like TORCH_INTERNAL_ASSERT, it supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the error +// message using operator<< (e.g., you can pass any object which has +// operator<< defined. Most objects in PyTorch have these definitions!) +// +// Usage: +// TORCH_CHECK(should_be_true); // A default error message will be provided +// // in this case; but we recommend writing an +// // explicit error message, as it is more +// // user friendly. +// TORCH_CHECK(x == 0, "Expected x to be 0, but got ", x); +// +// On failure, this macro will raise an exception. If this exception propagates +// to Python, it will convert into a Python RuntimeError. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike CHECK() from glog.) +// +#ifdef C10_MOBILE +#define TORCH_CHECK(cond, ...) 
\ + if (!(cond)) { \ + C10_THROW_ERROR(Error, \ + #cond " CHECK FAILED at " \ + __FILE__ \ + ); \ + } +#else +#define TORCH_CHECK(cond, ...) \ + if (!(cond)) { \ + C10_THROW_ERROR(Error, \ + ::c10::detail::if_empty_then( \ + ::c10::str(__VA_ARGS__), \ + "Expected " #cond " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)" \ + ) \ + ); \ + } +#endif +// TODO: We're going to get a lot of similar looking string literals +// this way; check if this actually affects binary size. -#define AT_CHECK(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(::c10::str(__VA_ARGS__)); \ +// Like TORCH_CHECK, but raises IndexErrors instead of Errors. +#ifdef C10_MOBILE +#define TORCH_CHECK_INDEX(cond, ...) \ + if (!(cond)) { \ + C10_THROW_ERROR(Error, \ + #cond " INDEX CHECK FAILED at " \ + __FILE__ \ + ); \ + } +#else +#define TORCH_CHECK_INDEX(cond, ...) \ + if (!(cond)) { \ + C10_THROW_ERROR(IndexError, \ + ::c10::detail::if_empty_then( \ + ::c10::str(__VA_ARGS__), \ + "Expected " #cond " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)" \ + ) \ + ); \ } +#endif + + +// Report a warning to the user. Accepts an arbitrary number of extra +// arguments which are concatenated into the warning message using operator<< +// +#define TORCH_WARN(...) \ + ::c10::Warning::warn({__func__, __FILE__, static_cast(__LINE__)}, ::c10::str(__VA_ARGS__)) + + +// ---------------------------------------------------------------------------- +// Deprecated macros +// ---------------------------------------------------------------------------- + +namespace c10 { namespace detail { + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ERROR(msg) is deprecated, use TORCH_CHECK(false, msg) instead.") +*/ +inline void deprecated_AT_ERROR() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_INDEX_ERROR(msg) is deprecated, use TORCH_CHECK_INDEX(false, msg) instead.") +*/ +inline void deprecated_AT_INDEX_ERROR() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_WARN is deprecated, use TORCH_WARN instead.") +*/ +inline void deprecated_AT_WARN() {} + +C10_DEPRECATED_MESSAGE("AT_CHECK is deprecated, use TORCH_CHECK instead.") +inline void deprecated_AT_CHECK() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ASSERT is deprecated, if you mean to indicate an internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user error checking, use " \ + "TORCH_CHECK. See https://github.com/pytorch/pytorch/issues/20287 for more details.") +*/ +inline void deprecated_AT_ASSERT() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ASSERTM is deprecated, if you mean to indicate an internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user error checking, use " \ + "TORCH_CHECK. See https://github.com/pytorch/pytorch/issues/20287 for more details.") +*/ +inline void deprecated_AT_ASSERTM() {} + +}} // namespace c10::detail + +// Deprecated alias; this alias was deprecated because it wasn't clear to +// people that you should use a macro with AT_ prefix inside the torch/csrc +// directory. Use TORCH_CHECK instead. +#define AT_CHECK(...) 
\ + do { \ + ::c10::detail::deprecated_AT_CHECK(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(__VA_ARGS__)); \ + } while (false); + +// Deprecated alias; this alias was deprecated because people kept mistakenly +// using it for user error checking. Use TORCH_INTERNAL_ASSERT or TORCH_CHECK +// instead. See https://github.com/pytorch/pytorch/issues/20287 for more details. +#define AT_ASSERT(...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERT(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)); \ + } while (false); + +// Deprecated alias, like AT_ASSERT. The new TORCH_INTERNAL_ASSERT macro supports +// both 0-ary and variadic calls, so having a separate message-accepting macro +// is not necessary. +// +// NB: we MUST include cond explicitly here, as MSVC will miscompile the macro +// expansion, shunting all of __VA_ARGS__ to cond. An alternate workaround +// can be seen at +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define AT_ASSERTM(cond, ...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERTM(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__)); \ + } while (false); + +// Deprecated alias; this alias was deprecated because it represents extra API +// surface that makes it hard for people to understand what macro to use. +// Use TORCH_CHECK(false, ...) or TORCH_INTERNAL_ASSERT(false, ...) to +// unconditionally fail at a line of code. +#define AT_ERROR(...) \ + do { \ + ::c10::detail::deprecated_AT_ERROR(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \ + } while (false); + +// Deprecated alias; this alias was deprecated for consistency with TORCH_CHECK. +#define AT_INDEX_ERROR(...) \ + do { \ + ::c10::detail::deprecated_AT_INDEX_ERROR(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK_INDEX(false, ::c10::str(__VA_ARGS__))); \ + } while (false); + +// Deprecated alias; this alias was deprecated because it wasn't clear to +// people that you should use a macro with AT_ prefix inside the torch/csrc +// directory. Use TORCH_WARN instead. +#define AT_WARN(...) 
\ + do { \ + ::c10::detail::deprecated_AT_WARN(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_WARN(__VA_ARGS__)); \ + } while (false); + #endif // C10_UTIL_EXCEPTION_H_ diff --git a/c10/util/Registry.h b/c10/util/Registry.h index 544ebed93aef..060f80d5fa0d 100644 --- a/c10/util/Registry.h +++ b/c10/util/Registry.h @@ -71,9 +71,11 @@ class Registry { if (registry_.count(key) != 0) { auto cur_priority = priority_[key]; if (priority > cur_priority) { + #ifdef DEBUG std::string warn_msg = "Overwriting already registered item for key " + KeyStrRepr(key); fprintf(stderr, "%s\n", warn_msg.c_str()); + #endif registry_[key] = creator; priority_[key] = priority; } else if (priority == cur_priority) { diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp index 4f49fad6646f..7c46b1332074 100644 --- a/c10/util/StringUtil.cpp +++ b/c10/util/StringUtil.cpp @@ -26,8 +26,8 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { } size_t ReplaceAll(std::string& s, const char* from, const char* to) { - AT_CHECK(from && *from, ""); - AT_CHECK(to, ""); + TORCH_CHECK(from && *from, ""); + TORCH_CHECK(to, ""); size_t numReplaced = 0; std::string::size_type lenFrom = std::strlen(from); diff --git a/c10/util/numa.cpp b/c10/util/numa.cpp index 2f4ceda56f18..99d58c23f846 100644 --- a/c10/util/numa.cpp +++ b/c10/util/numa.cpp @@ -27,7 +27,7 @@ void NUMABind(int numa_node_id) { return; } - AT_CHECK( + TORCH_CHECK( numa_node_id <= numa_max_node(), "NUMA node id ", numa_node_id, @@ -46,7 +46,7 @@ int GetNUMANode(const void* ptr) { AT_ASSERT(ptr); int numa_node = -1; - AT_CHECK( + TORCH_CHECK( get_mempolicy( &numa_node, NULL, @@ -83,7 +83,7 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) { numa_node_id >= 0 && static_cast(numa_node_id) < sizeof(unsigned long) * 8); unsigned long mask = 1UL << numa_node_id; - AT_CHECK( + TORCH_CHECK( mbind( reinterpret_cast(page_start_ptr), size + offset, diff --git a/c10/util/qint32.h b/c10/util/qint32.h new file mode 100644 index 000000000000..0aa744ee1701 --- /dev/null +++ b/c10/util/qint32.h @@ -0,0 +1,15 @@ +#pragma once +#include + +namespace c10 { + +/** + * qint32 is for signed 32 bit quantized Tensors + */ +struct alignas(4) qint32 { + using underlying = int32_t; + int32_t val_; + explicit qint32(int32_t val) : val_(val) {} +}; + +} // namespace c10 diff --git a/c10/util/qint8.h b/c10/util/qint8.h index e6c37216803c..27dd7b37351a 100644 --- a/c10/util/qint8.h +++ b/c10/util/qint8.h @@ -5,12 +5,13 @@ namespace c10 { /** * This is the data type for quantized Tensors. Right now we only have - * qint8 which is for 8 bit Tensors, we might have 4 bit, 2 bit or 1 bit - * data types in the future. + * qint8 which is for 8 bit Tensors, and qint32 for 32 bit int Tensors, + * we might have 4 bit, 2 bit or 1 bit data types in the future. 
*/ struct alignas(1) qint8 { - uint8_t val_; - explicit qint8(uint8_t val) : val_(val) {} + using underlying = int8_t; + int8_t val_; + explicit qint8(int8_t val) : val_(val) {} }; } // namespace c10 diff --git a/c10/util/quint8.h b/c10/util/quint8.h new file mode 100644 index 000000000000..0dbef3764283 --- /dev/null +++ b/c10/util/quint8.h @@ -0,0 +1,15 @@ +#pragma once +#include + +namespace c10 { + +/** + * qint8 is for signed 8 bit quantized Tensors + */ +struct alignas(1) quint8 { + using underlying = uint8_t; + uint8_t val_; + explicit quint8(uint8_t val) : val_(val) {} +}; + +} // namespace c10 diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index edfe204f6e41..0d56050a4de5 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -78,10 +78,13 @@ CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(24, int*) CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(25, detail::_guard_long_unique); CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE( 26, - detail::_guard_long_unique>); + detail::_guard_long_unique>) + +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(27, float*) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(28, at::Half*) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(29, c10::qint8) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(30, c10::quint8) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(31, c10::qint32) +CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(32, _CaffeHighestPreallocatedTypeId) -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(27, c10::qint8); -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(28, _CaffeHighestPreallocatedTypeId) -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(29, float*) -CAFFE_DEFINE_PREALLOCATED_KNOWN_TYPE(30, at::Half*) } // namespace caffe2 diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 058a0a7809e9..36e39458ac0f 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -17,15 +17,16 @@ #include -#include "c10/macros/Macros.h" -#include "c10/util/Backtrace.h" -#include "c10/util/C++17.h" -#include "c10/util/Exception.h" -#include "c10/util/Half.h" -#include "c10/util/IdWrapper.h" -#include "c10/util/qint8.h" - -#include "c10/util/Type.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* * TypeIdentifier is a small type containing an id. @@ -623,8 +624,11 @@ CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE( 26, detail::_guard_long_unique>) -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(27, c10::qint8); -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(28, _CaffeHighestPreallocatedTypeId) -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(29, float*) -CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(30, at::Half*) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(27, float*) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(28, at::Half*) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(29, c10::qint8) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(30, c10::quint8) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(31, c10::qint32) +CAFFE_DECLARE_PREALLOCATED_KNOWN_TYPE(32, _CaffeHighestPreallocatedTypeId) + } // namespace caffe2 diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index d19d88aa0d3c..96c2ed92b81a 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -54,11 +54,6 @@ else() # See cmake/Codegen.cmake for header installation endif() -# ---[ Torch build -if(BUILD_TORCH) - add_subdirectory(../torch torch) -endif() - # ---[ Caffe2 build # Note: the folders that are being commented out have not been properly # addressed yet. 
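The three quantized tag types registered with typeid above (c10::qint8, c10::quint8, c10::qint32) pair with the ScalarType helpers added earlier in this diff (toQIntType, toUnderlying, isUnderlying, and the extended isQIntType). A small sketch of the intended round-trip, assuming the headers from this patch are on the include path:

    #include <c10/core/ScalarType.h>
    #include <iostream>

    int main() {
      using c10::ScalarType;

      // Each quantized type maps onto the plain integer type that backs it.
      std::cout << std::boolalpha
                << (c10::toQIntType(ScalarType::Byte) == ScalarType::QUInt8) << "\n"   // true
                << (c10::toUnderlying(ScalarType::QInt32) == ScalarType::Int) << "\n"  // true
                << c10::isUnderlying(ScalarType::Char, ScalarType::QInt8) << "\n"      // true
                << c10::isQIntType(ScalarType::Float) << "\n";                         // false
    }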
@@ -211,12 +206,630 @@ if(NOT BUILD_ATEN_ONLY) endif() endif() + +# ========================================================== +# formerly-libtorch +# ========================================================== + +set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../torch") +set(TORCH_ROOT "${TORCH_SRC_DIR}/..") + +if(NOT TORCH_INSTALL_BIN_DIR) + set(TORCH_INSTALL_BIN_DIR bin) +endif() + +if(NOT TORCH_INSTALL_INCLUDE_DIR) + set(TORCH_INSTALL_INCLUDE_DIR include) +endif() + +if(NOT TORCH_INSTALL_LIB_DIR) + set(TORCH_INSTALL_LIB_DIR lib) +endif() + + + +if (NOT INTERN_BUILD_MOBILE) + + + set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + + # Generate files + set(TOOLS_PATH "${TORCH_ROOT}/tools") + + configure_file("${TORCH_ROOT}/aten/src/ATen/common_with_cwrap.py" + "${TOOLS_PATH}/shared/cwrap_common.py" + COPYONLY) + + configure_file("${TORCH_SRC_DIR}/_utils_internal.py" + "${TOOLS_PATH}/shared/_utils_internal.py" + COPYONLY) + + + set(GENERATED_CXX_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp" + "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp" + "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_1.cpp" + "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_2.cpp" + ) + + set(GENERATED_H_TORCH + "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h" + ) + + set(GENERATED_THNN_CXX_CUDA ${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp) + set(GENERATED_THNN_CXX ${TORCH_SRC_DIR}/csrc/nn/THNN.cpp) + + set(GENERATED_CXX_PYTHON + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" + ) + + set(GENERATED_H_PYTHON + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods_dispatch.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_dispatch.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.h" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions_dispatch.h" + ) + + set(GENERATED_THNN_SOURCES + ${GENERATED_THNN_CXX} + ${GENERATED_THNN_CXX_CUDA} + ) + + set(TORCH_GENERATED_CODE + ${GENERATED_CXX_TORCH} + ${GENERATED_THNN_SOURCES} + ${GENERATED_H_TORCH} + ${GENERATED_CXX_PYTHON} + ${GENERATED_H_PYTHON} + ) + + add_custom_command( + OUTPUT + ${TORCH_GENERATED_CODE} + COMMAND + "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py + --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + --nn-path "aten/src" + DEPENDS + "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + "${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h" + "${TOOLS_PATH}/autograd/templates/VariableType.h" + "${TOOLS_PATH}/autograd/templates/VariableType.cpp" + "${TOOLS_PATH}/autograd/templates/Functions.h" + "${TOOLS_PATH}/autograd/templates/Functions.cpp" + "${TOOLS_PATH}/autograd/templates/python_functions.h" + "${TOOLS_PATH}/autograd/templates/python_functions.cpp" + 
"${TOOLS_PATH}/autograd/templates/python_variable_methods.cpp" + "${TOOLS_PATH}/autograd/templates/python_variable_methods_dispatch.h" + "${TOOLS_PATH}/autograd/templates/python_torch_functions.cpp" + "${TOOLS_PATH}/autograd/templates/python_torch_functions_dispatch.h" + "${TOOLS_PATH}/autograd/templates/python_nn_functions.cpp" + "${TOOLS_PATH}/autograd/templates/python_nn_functions.h" + "${TOOLS_PATH}/autograd/templates/python_nn_functions_dispatch.h" + "${TOOLS_PATH}/autograd/templates/variable_factories.h" + "${TOOLS_PATH}/autograd/deprecated.yaml" + "${TOOLS_PATH}/autograd/derivatives.yaml" + "${TOOLS_PATH}/autograd/gen_autograd_functions.py" + "${TOOLS_PATH}/autograd/gen_autograd.py" + "${TOOLS_PATH}/autograd/gen_python_functions.py" + "${TOOLS_PATH}/autograd/gen_variable_factories.py" + "${TOOLS_PATH}/autograd/gen_variable_type.py" + "${TOOLS_PATH}/autograd/load_derivatives.py" + "${TOOLS_PATH}/autograd/nested_dict.py" + "${TOOLS_PATH}/autograd/utils.py" + "${TOOLS_PATH}/jit/gen_jit_dispatch.py" + "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" + WORKING_DIRECTORY "${TORCH_ROOT}") + + + # Required workaround for libtorch_python.so build + # see https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories + add_custom_target( + generate-torch-sources + DEPENDS ${TORCH_GENERATED_CODE} + ) + + + set(TORCH_SRCS + ${GENERATED_CXX_TORCH} + ${GENERATED_H_TORCH} + ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp + ${TORCH_SRC_DIR}/csrc/autograd/function.cpp + ${TORCH_SRC_DIR}/csrc/autograd/function_hook.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp + ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp + ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp + ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp + ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp + ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp + ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp + ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp + ${TORCH_SRC_DIR}/csrc/jit/attributes.cpp + ${TORCH_SRC_DIR}/csrc/jit/argument_spec.cpp + ${TORCH_SRC_DIR}/csrc/jit/export.cpp + ${TORCH_SRC_DIR}/csrc/jit/pass_manager.cpp + ${TORCH_SRC_DIR}/csrc/jit/pickler.cpp + ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp + ${TORCH_SRC_DIR}/csrc/jit/import_source.cpp + ${TORCH_SRC_DIR}/csrc/jit/import.cpp + ${TORCH_SRC_DIR}/csrc/jit/import_export_helpers.cpp + ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp + ${TORCH_SRC_DIR}/csrc/jit/constants.cpp + ${TORCH_SRC_DIR}/csrc/jit/node_hashing.cpp + ${TORCH_SRC_DIR}/csrc/jit/ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/irparser.cpp + ${TORCH_SRC_DIR}/csrc/jit/netdef_converter.cpp + ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_c10_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/subgraph_matcher.cpp + ${TORCH_SRC_DIR}/csrc/jit/symbolic_script.cpp + ${TORCH_SRC_DIR}/csrc/jit/profiling_record.cpp + ${TORCH_SRC_DIR}/csrc/jit/profiling_graph_executor_impl.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/alias_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_pooling.cpp + 
${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inline_autodiff_subgraphs.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/insert_guards.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/erase_number_types.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inline_fork_wait.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/lower_grad_of.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/remove_inplace_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_autogradzero.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/subgraph_rewrite.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/python_print.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/utils/subgraph_utils.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/utils/check_alias_annotation.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/utils/memory_dag.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/quantization.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_quantized_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/scope.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp + ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/jit/testing/file_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/final_returns.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/schema_matching.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/script_type_parser.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/sugared_value.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/class_type.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/parser.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/edit_distance.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/logging.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/jit_exception.cpp + ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp + ${TORCH_SRC_DIR}/csrc/jit/hooks_for_testing.cpp + ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp + ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/executor.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/codegen.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/fallback.cpp + ${TORCH_ROOT}/test/cpp/jit/test.cpp + ) + + if (WIN32) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_win.cpp + ) + else () + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_unix.cpp + ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp + ) + if (USE_CUDA AND NOT USE_ROCM) + list(APPEND Caffe2_GPU_SRCS + ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/fused_kernel.cpp + ) + add_library(thnvrtc SHARED ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/thnvrtc.cpp) + target_link_libraries(thnvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB}) + target_include_directories(thnvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) + install(TARGETS thnvrtc DESTINATION 
"${TORCH_INSTALL_LIB_DIR}") + + endif() + endif () + + if (USE_CUDA) + list(APPEND Caffe2_GPU_SRCS + ${TORCH_SRC_DIR}/csrc/autograd/profiler_cuda.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp + ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp + ) + endif() + + if (USE_ROCM) + list(APPEND Caffe2_HIP_SRCS + ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp + ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp + ) + endif() + + if (NOT NO_API) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/datasets/mnist.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/distributed.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp + ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp + ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/batchnorm.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/conv.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/dropout.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/embedding.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/functional.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/linear.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/named_any.cpp + ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/rnn.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adagrad.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/adam.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/optimizer.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/serialize.cpp + ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp + ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp + ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp + ) + endif() + + + list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS}) +endif() + +# ========================================================== +# END formerly-libtorch sources +# ========================================================== + + # Compile exposed libraries. add_library(caffe2 ${Caffe2_CPU_SRCS}) -if (NOT WIN32) + + +option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) + + +# This is required for older versions of CMake, which don't allow +# specifying add_library() without a list of source files +set(DUMMY_EMPTY_FILE ${CMAKE_BINARY_DIR}/empty.cpp) + +if (MSVC) + set(DUMMY_FILE_CONTENT "__declspec(dllexport) int ignore_this_library_placeholder(){return 0\\;}") +else() + set(DUMMY_FILE_CONTENT "") +endif() + +file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT}) + + +# Wrapper library for transition to merged libcaffe and libtorch. +# Only necessary on Windows? +# Contains "caffe2" and "caffe2_gpu". +if (TORCH_STATIC) + add_library(torch STATIC ${DUMMY_EMPTY_FILE}) +else() + add_library(torch SHARED ${DUMMY_EMPTY_FILE}) +endif() + + +target_link_libraries(torch caffe2) + +# ========================================================== +# formerly-libtorch flags +# ========================================================== + +if (NOT INTERN_BUILD_MOBILE) + + # Forces caffe2.pb.h to be generated before its dependents are compiled. + # Adding the generated header file to the ${TORCH_SRCS} list is not sufficient + # to establish the dependency, since the generation procedure is declared in a different CMake file. 
+ # See https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories + add_dependencies(caffe2 Caffe2_PROTO) + + target_compile_definitions(caffe2 PUBLIC _THP_CORE) + + + # until they can be unified, keep these lists synced with setup.py + if(MSVC) + + if (MSVC_Z7_OVERRIDE) + set(MSVC_DEBINFO_OPTION "/Z7") + else() + set(MSVC_DEBINFO_OPTION "/Zi") + endif() + + target_compile_options(caffe2 PUBLIC + ${MSVC_RUNTIME_LIBRARY_OPTION} + ${MSVC_DEBINFO_OPTION} + /EHa + /DNOMINMAX + /wd4267 + /wd4251 + /wd4522 + /wd4522 + /wd4838 + /wd4305 + /wd4244 + /wd4190 + /wd4101 + /wd4996 + /wd4275 + /bigobj + ) + else() + target_compile_options(caffe2 PUBLIC + # -std=c++11 + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + # Clang has an unfixed bug leading to spurious missing braces + # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 + -Wno-missing-braces + ) + + if(NOT APPLE) + target_compile_options(caffe2 PRIVATE + # Considered to be flaky. See the discussion at + # https://github.com/pytorch/pytorch/pull/9608 + -Wno-maybe-uninitialized) + endif() + + endif() + + if (MSVC) + elseif (WERROR) + target_compile_options(caffe2 PRIVATE -Werror -Wno-strict-overflow) + endif() + + + if (NOT NO_API) + target_include_directories(caffe2 PRIVATE + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) + endif() + + if(USE_CUDA) + if(MSVC) + if (NOT NVTOOLEXT_HOME) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + endif() + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + target_include_directories(caffe2 PUBLIC "${NVTOOLEXT_HOME}/include") + + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + + else() + find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) + set(TORCH_CUDA_LIBRARIES + ${LIBNVTOOLSEXT} + ${CUDA_LIBRARIES}) + endif() + + target_compile_definitions(caffe2 PRIVATE USE_CUDA) + endif() + + + set(TH_CPU_INCLUDE + # dense + aten/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/aten/src/TH + ${TORCH_ROOT}/aten/src + ${CMAKE_CURRENT_BINARY_DIR}/aten/src + ${CMAKE_BINARY_DIR}/aten/src) + target_include_directories(caffe2 PRIVATE ${TH_CPU_INCLUDE}) + + set(ATen_CPU_INCLUDE + ${TORCH_ROOT}/aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen + ${CMAKE_BINARY_DIR}/aten/src) + target_include_directories(caffe2 PRIVATE ${ATen_CPU_INCLUDE}) + + target_include_directories(caffe2 PRIVATE + ${TORCH_SRC_DIR}/csrc) + + target_include_directories(caffe2 PRIVATE + ${TORCH_ROOT}/third_party/miniz-2.0.8) + + + set_property(TARGET caffe2 PROPERTY CXX_STANDARD 11) + + + # Prevent the unused functions being optimized away + # Otherwise torch.dll will be linked without caffe2_gpu.dll + if (MSVC) + # TODO What to do with this line? 
+ set_target_properties(caffe2 PROPERTIES LINK_FLAGS "/OPT:NOREF") + endif() + + install(DIRECTORY "${TORCH_SRC_DIR}/csrc" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch + FILES_MATCHING PATTERN "*.h") + install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" + DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) + + + if (BUILD_TEST AND NOT MSVC AND NOT USE_ROCM) + add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) + endif() + + if (BUILD_TEST AND NOT NO_API) + add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api) + endif() + + + # XXX This ABI check cannot be run with arm-linux-androideabi-g++ + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if (DEFINED GLIBCXX_USE_CXX11_ABI) + message(STATUS "_GLIBCXX_USE_CXX11_ABI is already defined as a cmake variable") + else() + message(STATUS "${CMAKE_CXX_COMPILER} ${TORCH_SRC_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") + execute_process( + COMMAND + "${CMAKE_CXX_COMPILER}" + "${TORCH_SRC_DIR}/abi-check.cpp" + "-o" + "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) + if (ABI_CHECK_COMPILE_RESULT) + message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") + endif() + execute_process( + COMMAND "${CMAKE_BINARY_DIR}/abi-check" + RESULT_VARIABLE ABI_CHECK_RESULT + OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) + if (ABI_CHECK_RESULT) + message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") + endif() + endif() + message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") + endif() + + # CMake config for external projects. + configure_file( + ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + @ONLY) + configure_file( + ${TORCH_ROOT}/cmake/TorchConfig.cmake.in + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + @ONLY) + install(FILES + ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake + ${PROJECT_BINARY_DIR}/TorchConfig.cmake + DESTINATION share/cmake/Torch) + + if (USE_DISTRIBUTED) + add_subdirectory(${TORCH_SRC_DIR}/lib/THD lib_THD) + if (NOT MSVC AND NOT APPLE) + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + endif() + endif() + + + # ---[ Torch python bindings build + add_subdirectory(../torch torch) + + +endif() +# ========================================================== +# END formerly-libtorch flags +# ========================================================== + + + + + + + + + + + + + + + + + + +if (NOT NO_API) + target_include_directories(caffe2 PUBLIC + $ + $) +endif() + + +find_package(OpenMP QUIET) +if(USE_OPENMP AND OPENMP_FOUND) + message(STATUS "pytorch is compiling with OpenMP. \n" + "OpenMP CXX_FLAGS: ${OpenMP_CXX_FLAGS}. \n" + "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}.") + target_compile_options(caffe2 INTERFACE ${OpenMP_CXX_FLAGS}) + target_link_libraries(caffe2 PRIVATE ${OpenMP_CXX_LIBRARIES}) +endif() + + +if(USE_ROCM) + + # XXX kostmo +# target_link_libraries(caffe2 PUBLIC caffe2_hip_library) + + target_compile_definitions(caffe2 PRIVATE + USE_ROCM + __HIP_PLATFORM_HCC__ + ) + target_include_directories(caffe2 PRIVATE + /opt/rocm/include + /opt/rocm/hcc/include + /opt/rocm/rocblas/include + /opt/rocm/hipsparse/include + ) +endif() + + +if (NOT WIN32 AND NOT USE_ASAN) + # Enable hidden visibility by default to make it easier to debug issues with + # TORCH_API annotations. Hidden visibility with selective default visibility + # behaves close enough to Windows' dllimport/dllexport. 
+ # + # Unfortunately, hidden visibility messes up some ubsan warnings because + # templated classes crossing library boundary get duplicated (but identical) + # definitions. It's easier to just disable it. target_compile_options(caffe2 PRIVATE "-fvisibility=hidden") endif() + if(NOT BUILD_ATEN_ONLY) caffe2_interface_library(caffe2_protos caffe2_protos_whole) target_link_libraries(caffe2 PRIVATE caffe2_protos_whole) @@ -244,9 +857,9 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") # Set standard properties on the target torch_set_target_props(caffe2) -if (NOT MSVC) - target_compile_options(caffe2 INTERFACE "$<$:-std=c++11>") -endif() +#if (NOT MSVC) +# target_compile_options(caffe2 INTERFACE "$<$:-std=c++11>") +#endif() target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") if (MSVC AND NOT BUILD_SHARED_LIBS) @@ -325,7 +938,17 @@ if (MSVC AND BUILD_SHARED_LIBS) endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") + + install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) + +install(TARGETS torch DESTINATION "${TORCH_INSTALL_LIB_DIR}") + +if (MSVC AND NOT TORCH_STATIC) + install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) +endif() + + caffe2_interface_library(caffe2 caffe2_library) list(APPEND Caffe2_MAIN_LIBS caffe2_library) # Install PDB files for MSVC builds @@ -335,12 +958,15 @@ endif() # ---[ CUDA library. if(USE_CUDA) + set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) torch_cuda_based_add_library(caffe2_gpu ${Caffe2_GPU_SRCS}) set(CUDA_LINK_LIBRARIES_KEYWORD) target_link_libraries(caffe2_gpu INTERFACE caffe2::cudart) target_link_libraries(caffe2_gpu PUBLIC c10_cuda) + target_link_libraries(caffe2_gpu PUBLIC ${TORCH_CUDA_LIBRARIES}) + target_include_directories( caffe2_gpu INTERFACE $) target_include_directories( @@ -375,10 +1001,14 @@ if(USE_CUDA) caffe2_interface_library(caffe2_gpu caffe2_gpu_library) list(APPEND Caffe2_MAIN_LIBS caffe2_gpu_library) + # Install PDB files for MSVC builds if (MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION lib OPTIONAL) endif() + + target_link_libraries(torch caffe2_gpu) + endif() # ---[ Caffe2 HIP sources. diff --git a/caffe2/core/c10_operator.h b/caffe2/core/c10_operator.h index a82d0d8402d6..27d66523ad9d 100644 --- a/caffe2/core/c10_operator.h +++ b/caffe2/core/c10_operator.h @@ -1,10 +1,9 @@ #pragma once +#if !defined(CAFFE2_IS_XPLAT_BUILD) #include #include -#if !defined(CAFFE2_IS_XPLAT_BUILD) #include -#endif #include namespace caffe2 { @@ -156,7 +155,6 @@ inline std::unique_ptr noCache() { * - If your operator has a variable number of input tensors, make the first (!) * input an input of type TensorList. There must be no other tensor inputs. 
*/ -#if !defined(CAFFE2_IS_XPLAT_BUILD) #define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \ namespace caffe2 { \ namespace _c10_ops { \ @@ -180,24 +178,26 @@ inline std::unique_ptr noCache() { static auto registry_##OperatorName##_##__COUNTER__ = \ ::c10::RegisterOperators().op( \ ::caffe2::_c10_ops::schema_##OperatorName(), \ - ::c10::kernel( \ - &::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>, \ - &::caffe2::detail::noCache), \ - ::c10::dispatchKey(::c10::CPUTensorId())); + ::c10::RegisterOperators::options() \ + .kernel( \ + &::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>, \ + &::caffe2::detail::noCache) \ + .dispatchKey(::c10::CPUTensorId())); #define C10_REGISTER_CAFFE2_OPERATOR_CUDA(OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ static auto registry_##OperatorName##_##__COUNTER__ = \ ::c10::RegisterOperators().op( \ ::caffe2::_c10_ops::schema_##OperatorName(), \ - ::c10::kernel( \ - &::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>, \ - &::caffe2::detail::noCache), \ - ::c10::dispatchKey(::c10::CUDATensorId())); + ::c10::RegisterOperators::options() \ + .kernel( \ + &::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>, \ + &::caffe2::detail::noCache) \ + .dispatchKey(::c10::CUDATensorId())); // You should never manually call the C10_REGISTER_CAFFE2_OPERATOR_HIP macro. // The C10_REGISTER_CAFFE2_OPERATOR_CUDA macro from above will be automatically @@ -207,12 +207,13 @@ inline std::unique_ptr noCache() { static auto registry_##OperatorName##_##__COUNTER__ = \ ::c10::RegisterOperators().op( \ ::caffe2::_c10_ops::schema_##OperatorName(), \ - ::c10::kernel( \ - &::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>, \ - &::caffe2::detail::noCache), \ - ::c10::dispatchKey(::c10::HIPTensorId())); + ::c10::RegisterOperators().options() \ + .kernel( \ + &::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>, \ + &::caffe2::detail::noCache) \ + .dispatchKey(::c10::HIPTensorId())); #else // Don't use c10 dispatcher on mobile because of binary size diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index 7e4c6e51bfeb..41a314403489 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -190,6 +190,15 @@ int ExecutorHelper::GetNumWorkers() const { CAFFE_THROW("Not implemented"); } +// benchmark an individual run so that we can FeedBlobs with new inputs +// no warmup +// return time taken in microseconds +float NetBase::TEST_Benchmark_One_Run() { + Timer timer; + CAFFE_ENFORCE(Run(), "Run has failed."); + return timer.MicroSeconds(); +} + std::vector NetBase::TEST_Benchmark( const int warmup_runs, const int main_runs, diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 17a7e329cf7a..7bfb47fe1ca0 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -17,7 +17,6 @@ #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" #include "caffe2/core/tensor.h" -#include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/simple_queue.h" @@ -63,6 +62,13 @@ class CAFFE2_API NetBase : public Observable { virtual bool RunAsync(); + /* Benchmarks a network for one individual run so that we can feed new + * inputs on additional calls. 
+ * This function returns the number of microseconds spent + * during the benchmark + */ + virtual float TEST_Benchmark_One_Run(); + /** * Benchmarks a network. * diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 4bd3e1a715ef..57e93df605f6 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -29,8 +29,14 @@ C10_DEFINE_bool( C10_DEFINE_bool( caffe2_operator_throw_if_fp_exceptions, false, - "If set, throws if floating point exceptions (FE_DIVBYZERO, FE_INVALID, " - "FE_OVERFLOW) are detected when running any operator."); + "If set, throws if floating point exceptions (FE_DIVBYZERO, FE_INVALID) " + "are detected when running any operator. FE_OVERFLOW is handled separately " + "by caffe2_operator_throw_if_fp_overflow_exceptions option."); +C10_DEFINE_bool( + caffe2_operator_throw_if_fp_overflow_exceptions, + false, + "If set, throws if floating point exception FE_OVERFLOW is detected when " + "running any operator."); namespace caffe2 { @@ -63,8 +69,11 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws) type_ = operator_def.type(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) namespace { -int compute_input_size_(const std::vector& inputs) { +int +C10_UNUSED // Suppress unused function warning on mobile. +compute_input_size_(const std::vector& inputs) { if (inputs.empty()) { return 0; } @@ -103,6 +112,7 @@ OperatorBase::OperatorBase( input_tensors_.resize(input_size_); output_tensors_.resize(newstyle_outputs_.size()); } +#endif vector OperatorBase::InputTensorShapes() const { vector tps; @@ -577,18 +587,43 @@ TensorShapes InferBlobShapesAndTypes( return tps; } -void LoadInt8TensorInfoOfBlob(float* scale, float* offset, const Blob* b) { - const int8::Int8TensorCPU* i8tc = +void LoadInt8TensorInfoOfBlob( + std::vector* scale, + std::vector* offset, + uint32_t* axis, + const Blob* b) { + const int8::Int8TensorCPU* int8_tensor = static_cast(b->GetRaw()); - *scale = i8tc->scale; - *offset = i8tc->zero_point; + scale->clear(); + offset->clear(); + scale->push_back(int8_tensor->scale); + offset->push_back(int8_tensor->zero_point); + *axis = 1; } TensorShape GetTensorShapeOfBlob(const Blob* b) { - TypeCall type_fun = GetTypeCallFunction(b->meta().id()); - TensorInfoCall tensor_info_fun = GetTensorInfoFunction(b->meta().id()); TensorShape tp; +#ifndef C10_MOBILE + auto function_ptr = + ExternalTensorFunctionsBaseRegistry()->Create(b->meta().id()); + if (function_ptr != nullptr) { + // This is dnnlowp tensor and we cant deal with it using regular path + auto dtype = function_ptr->GetExternalTensorType(b->GetRaw()); + tp.set_data_type(TypeMetaToDataType(dtype)); + size_t _capacity; + DeviceOption _device; + auto dshape = + function_ptr->GetExternalTensorInfo(b->GetRaw(), &_capacity, &_device); + for (auto d : dshape) { + tp.add_dims(d); + } + return tp; + } +#endif + + TypeCall type_fun = GetTypeCallFunction(b->meta().id()); + TensorInfoCall tensor_info_fun = GetTensorInfoFunction(b->meta().id()); if (type_fun) { tp.set_data_type(TypeMetaToDataType(type_fun(b->GetRaw()))); } @@ -737,9 +772,21 @@ std::function GetOperatorLogger() { c10::optional OperatorBase::argumentIndexWithName( const std::string& name) const { +#if !defined(CAFFE2_IS_XPLAT_BUILD) return getFunctionSchema().argumentIndexWithName(name); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } OperatorBase::~OperatorBase() noexcept = default; +#ifndef C10_MOBILE +C10_DEFINE_TYPED_REGISTRY( + ExternalTensorFunctionsBaseRegistry, + TypeIdentifier, + 
ExternalTensorFunctionsBase, + std::unique_ptr); +#endif + } // namespace caffe2 diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9e4c5952a82f..b094036a0706 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -6,12 +6,15 @@ #include #include #include +#include #include +#include #include #include -#include "c10/macros/Macros.h" -#include "c10/util/Registry.h" +#include +#include +#include #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" @@ -26,9 +29,12 @@ #include "caffe2/utils/proto_utils.h" #include +#if !defined(CAFFE2_IS_XPLAT_BUILD) #include +#endif C10_DECLARE_bool(caffe2_operator_throw_if_fp_exceptions); +C10_DECLARE_bool(caffe2_operator_throw_if_fp_overflow_exceptions); namespace c10 { struct FunctionSchema; @@ -50,10 +56,12 @@ class CAFFE2_API OperatorBase : public Observable { * Alternatively, inputs can be one tensor list ivalue followed by non-tensors * to represent operators with a variable number of inputs. */ +#if !defined(CAFFE2_IS_XPLAT_BUILD) explicit OperatorBase( const c10::FunctionSchema& schema, std::vector inputs, std::vector outputs); +#endif virtual ~OperatorBase() noexcept; @@ -61,12 +69,20 @@ class CAFFE2_API OperatorBase : public Observable { * New operators should be instantiated with FunctionSchema */ bool isLegacyOperator() const { +#if !defined(CAFFE2_IS_XPLAT_BUILD) return !fn_schema_; +#else + return true; +#endif } const c10::FunctionSchema& getFunctionSchema() const { CAFFE_ENFORCE(!isLegacyOperator()); +#if !defined(CAFFE2_IS_XPLAT_BUILD) return *fn_schema_.get(); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } /** @brief Checks if the operator has an argument of the given name. @@ -88,10 +104,14 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::GetSingleArgument( *operator_def_, name, default_value); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; return value.template to(); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } template @@ -100,10 +120,12 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::HasSingleArgumentOfType( *operator_def_, name); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) template inline vector GetVectorFromIValueList(const c10::IValue& value) const { return value.template to>(); } +#endif template inline vector GetRepeatedArgument( @@ -114,10 +136,14 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::GetRepeatedArgument( *operator_def_, name, default_value); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; return GetVectorFromIValueList(value); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } // Get the inputs and outputs as specific types. 
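The CAFFE2_IS_XPLAT_BUILD guards above compile the FunctionSchema-based (new-style) paths out of mobile/xplat builds while keeping the legacy OperatorDef path, and isLegacyOperator() selects between the two at runtime. The sketch below shows the shape of that dual argument lookup using invented stand-in types (Value, Schema, Op); it is a simplification, not the real caffe2::OperatorBase or c10 API.

// Simplified stand-ins: Value plays the role of c10::IValue, Schema the role
// of c10::FunctionSchema, and Op the role of OperatorBase.
#include <cstdint>
#include <map>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

struct Value {
  int64_t i = 0;
};

struct Schema {
  std::vector<std::string> arg_names;
  std::optional<size_t> argumentIndexWithName(const std::string& name) const {
    for (size_t idx = 0; idx < arg_names.size(); ++idx) {
      if (arg_names[idx] == name) {
        return idx;
      }
    }
    return std::nullopt;
  }
};

struct Op {
  std::map<std::string, int64_t> legacy_args;  // arguments from an OperatorDef
  const Schema* schema = nullptr;              // null => legacy operator
  std::vector<Value> newstyle_inputs;          // schema-indexed inputs

  bool isLegacyOperator() const {
    return schema == nullptr;
  }

  int64_t GetSingleArgument(const std::string& name, int64_t default_value) const {
    if (isLegacyOperator()) {
      auto it = legacy_args.find(name);
      return it == legacy_args.end() ? default_value : it->second;
    }
    // New-style path: look the argument up by name in the schema, then index
    // into the inputs. On an xplat/mobile build this branch is compiled out
    // and replaced by a throw, which is what the CAFFE_THROW fallbacks do.
    auto index = schema->argumentIndexWithName(name);
    if (!index) {
      throw std::runtime_error("Couldn't get index for argument: " + name);
    }
    return newstyle_inputs[*index].i;
  }
};

int main() {
  Schema s{{"batch_size", "crop"}};
  Op legacy{{{"crop", 224}}, nullptr, {}};
  Op newstyle{{}, &s, {Value{32}, Value{224}}};
  // Both paths resolve "crop" to 224.
  return legacy.GetSingleArgument("crop", -1) ==
          newstyle.GetSingleArgument("crop", -1)
      ? 0
      : 1;
}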
@@ -165,6 +191,7 @@ class CAFFE2_API OperatorBase : public Observable { throw enf; } } +#if !defined(CAFFE2_IS_XPLAT_BUILD) DCHECK_LT(0, newstyle_inputs_.size()); IValue ival; if (newstyle_inputs_[0].isTensorList()) { @@ -186,6 +213,9 @@ class CAFFE2_API OperatorBase : public Observable { CAFFE_ENFORCE_EQ(tensor.GetDeviceType(), type); input_tensors_[idx] = std::move(tensor); return input_tensors_[idx]; +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } template @@ -207,6 +237,7 @@ class CAFFE2_API OperatorBase : public Observable { // When you get a Tensor here it is not fully initialized return BlobGetMutableTensor(outputs_.at(idx), type); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto& output = newstyle_outputs_[idx]; Tensor tensor = caffe2::Tensor(output); if (!tensor.defined() || tensor.GetDeviceType() != type) { @@ -216,6 +247,9 @@ class CAFFE2_API OperatorBase : public Observable { } output_tensors_[idx] = caffe2::Tensor(output); return &output_tensors_[idx]; +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } inline Tensor @@ -232,10 +266,14 @@ class CAFFE2_API OperatorBase : public Observable { void SetOutputTensor(int idx, Tensor tensor) { if (!isLegacyOperator()) { +#if !defined(CAFFE2_IS_XPLAT_BUILD) newstyle_outputs_[idx] = at::Tensor(tensor); // also update the tensor in the hack output_tensors_[idx] = std::move(tensor); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } else { // update the tensor in the workspace BlobSetTensor(outputs_.at(idx), std::move(tensor)); @@ -257,6 +295,7 @@ class CAFFE2_API OperatorBase : public Observable { "device must be provided in options."); return BlobGetMutableTensor(outputs_.at(idx), dims, options); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) auto& output = newstyle_outputs_[idx]; Tensor tensor = GetSizedTensorWithOptions(caffe2::Tensor(output), dims, options); @@ -265,6 +304,9 @@ class CAFFE2_API OperatorBase : public Observable { output_tensors_[idx] = caffe2::Tensor(output); return &output_tensors_[idx]; +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } // Get output Tensor of the operator and CopyFrom the given Tensor @@ -349,7 +391,11 @@ class CAFFE2_API OperatorBase : public Observable { if (isLegacyOperator()) { return outputs_.size(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) return newstyle_outputs_.size(); +#else + CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); +#endif } inline const vector& Inputs() const { return inputs_; } inline const vector& Outputs() { return outputs_; } @@ -540,9 +586,11 @@ class CAFFE2_API OperatorBase : public Observable { return helper_; } +#if !defined(CAFFE2_IS_XPLAT_BUILD) std::vector move_newstyle_outputs() && { return std::move(newstyle_outputs_); } +#endif public: static const int kNoNetPositionSet = -1; @@ -556,9 +604,11 @@ class CAFFE2_API OperatorBase : public Observable { vector inputs_; vector outputs_; // Preferrably use c10::optional, but nvcc doesn't work +#if !defined(CAFFE2_IS_XPLAT_BUILD) std::unique_ptr fn_schema_; vector newstyle_inputs_; vector newstyle_outputs_; +#endif // HACK // We preserve the fact that Output() returns Tensor* // by storing Tensor in a vector owned by the @@ -618,6 +668,7 @@ inline NetDef OperatorBase::GetSingleArgument( return NetDef(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) template <> inline vector OperatorBase::GetVectorFromIValueList( const c10::IValue& value) const { @@ -649,6 +700,7 @@ inline vector 
OperatorBase::GetVectorFromIValueList( vector out; return out; } +#endif // OP_SINGLE_ARG provides a shorter initialization choice for initialization of // member variables for the class constructors. @@ -688,6 +740,7 @@ class Operator : public OperatorBase { // constructors will run on that device. context_.SwitchToDevice(); } +#if !defined(CAFFE2_IS_XPLAT_BUILD) explicit Operator( const c10::FunctionSchema& fn_schema, std::vector inputs, @@ -697,6 +750,7 @@ class Operator : public OperatorBase { // constructors will run on that device. context_.SwitchToDevice(); } +#endif ~Operator() noexcept override {} /// Retrieve a non-owning reference to the input at position 'idx' for this @@ -844,6 +898,8 @@ class Operator : public OperatorBase { CAFFE_ENFORCE( !std::fetestexcept(FE_INVALID), "Invalid floating point exception (FE_INVALID) reported."); + } + if (FLAGS_caffe2_operator_throw_if_fp_overflow_exceptions) { CAFFE_ENFORCE( !std::fetestexcept(FE_OVERFLOW), "Overflow floating point exception (FE_OVERFLOW) reported."); @@ -1342,8 +1398,11 @@ CAFFE2_API void SetOpEnginePref( const std::string& op_type, const CaffeMap& op_pref); -CAFFE2_API void -LoadInt8TensorInfoOfBlob(float* scale, float* offset, const Blob* b); +CAFFE2_API void LoadInt8TensorInfoOfBlob( + std::vector* scale, + std::vector* offset, + uint32_t* axis, + const Blob* b); CAFFE2_API TensorShape GetTensorShapeOfBlob(const Blob* b); @@ -1375,6 +1434,57 @@ CAFFE2_API std::set GetRegisteredOperators(); CAFFE2_API void SetOperatorLogger(std::function tracer); std::function GetOperatorLogger(); +#ifndef C10_MOBILE +// This is for transferring tensor data between C2 and backends. +struct ExternalTensorDescriptor { + uint64_t dataType; + uint32_t dimensions; + const uint64_t* shape; + uint32_t quantizationAxis; + uint64_t quantizationParams; + const float* scales; + const int32_t* biases; + uint64_t buffer; +}; + +class ExternalTensorFunctionsBase { + public: + explicit ExternalTensorFunctionsBase() {} + virtual ~ExternalTensorFunctionsBase() {} + virtual bool IsSameMetaType(TypeIdentifier id) = 0; + virtual void SetupExternalTensorDescriptor( + const Blob* blob, + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets, + ExternalTensorDescriptor* desc) = 0; + virtual void LoadInfoOfBlob( + const Blob* blob, + std::vector* scale, + std::vector* offset, + uint32_t* axis) = 0; + virtual TypeIdentifier GetTypeMetaId(const string& name) = 0; + virtual TypeMeta GetExternalTensorType(const void* c) = 0; + virtual vector GetExternalTensorInfo( + const void* c, + size_t* capacity, + DeviceOption* device) = 0; +}; + +C10_DECLARE_TYPED_REGISTRY( + ExternalTensorFunctionsBaseRegistry, + TypeIdentifier, + ExternalTensorFunctionsBase, + std::unique_ptr); + +#define REGISTER_EXTERNAL_TENSOR_FUNCTIONS(id, ...) 
\ + C10_REGISTER_TYPED_CLASS(ExternalTensorFunctionsBaseRegistry, id, __VA_ARGS__) +inline unique_ptr CreateExternalTensorFunctions( + TypeIdentifier id) { + return ExternalTensorFunctionsBaseRegistry()->Create(id); +} +#endif // C10_MOBILE + } // namespace caffe2 diff --git a/caffe2/core/operator_c10wrapper.h b/caffe2/core/operator_c10wrapper.h index d18d1fcb1858..ab9e636b0a70 100644 --- a/caffe2/core/operator_c10wrapper.h +++ b/caffe2/core/operator_c10wrapper.h @@ -1,5 +1,7 @@ #pragma once +// TODO Also register c10 operators on mobile +#if !defined(CAFFE2_IS_XPLAT_BUILD) #include #include #include @@ -180,7 +182,7 @@ class C10OperatorWrapper final : public Operator { if (default_value.has_value()) { return this->template GetSingleArgument(name, default_value->to()); } else { - AT_CHECK( + TORCH_CHECK( this->template HasSingleArgumentOfType(name), "Error in caffe2->c10 wrapper: Expected argument '", name, @@ -225,9 +227,8 @@ createC10OperatorWrapper(const char* op_name, const char* overload_name) { } } // namespace detail +} // namespace caffe2 -// TODO Also register c10 operators on mobile -#if !defined(CAFFE2_IS_XPLAT_BUILD) // TODO Currently we only register the CPU variant. This is going to be fixed // once the tensor detemplatization lands. #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU( \ @@ -256,4 +257,3 @@ createC10OperatorWrapper(const char* op_name, const char* overload_name) { #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_HIP( \ OperatorName, Name) #endif -} // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 182b1563e5ee..67018bfcaea1 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -57,9 +57,11 @@ TypeMeta GetTensorType(const void* c) { const Tensor* tc = static_cast(c); return tc->dtype(); } + TypeMeta GetInt8TensorType(const void* c) { - const int8::Int8TensorCPU* i8tc = static_cast(c); - return (i8tc->t).dtype(); + const int8::Int8TensorCPU* int8_tensor = + static_cast(c); + return (int8_tensor->t).dtype(); } // TODO(jerryzh): Remove @@ -98,9 +100,11 @@ vector GetTensorInfo( vector GetInt8TensorInfo(const void* c, size_t* capacity, DeviceOption* device) { - const int8::Int8TensorCPU* i8tc = static_cast(c); - return GetTensorInfo(&(i8tc->t), capacity, device); + const int8::Int8TensorCPU* int8_tensor = + static_cast(c); + return GetTensorInfo(&(int8_tensor->t), capacity, device); } + // since we only have one tensor, probably need to remove this at some point? 
static CaffeMap tensor_info_call_registry_{ {TypeMeta::Id(), GetTensorInfo}, diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index f65e7712c641..9fbc160b2509 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -519,8 +519,8 @@ class CAFFE2_API Tensor final { return impl_.get()->strides(); } - inline bool is_contiguous() const { - return impl_.get()->is_contiguous(); + inline bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const { + return impl_.get()->is_contiguous(memory_format); } /** @@ -649,6 +649,9 @@ Tensor TensorCPUFromValues(at::IntArrayRef dims, at::ArrayRef values) { return r; } +vector +GetTensorInfo(const void* c, size_t* capacity, DeviceOption* device); + class CAFFE2_API TensorPrinter { public: explicit TensorPrinter( diff --git a/caffe2/image/image_input_op.cc b/caffe2/image/image_input_op.cc index a01994c46592..be21e791ad16 100644 --- a/caffe2/image/image_input_op.cc +++ b/caffe2/image/image_input_op.cc @@ -19,21 +19,21 @@ REGISTER_CPU_OPERATOR(ImageInput, ImageInputOp); OPERATOR_SCHEMA(ImageInput) .NumInputs(0, 1) .NumOutputs(2, INT_MAX) - .TensorInferenceFunction( - [](const OperatorDef& def, const vector& /* unused */ ) { - vector out(2); - ArgumentHelper helper(def); - int batch_size = helper.GetSingleArgument("batch_size", 0); - int crop = helper.GetSingleArgument("crop", -1); - int color = helper.GetSingleArgument("color", 1); - CHECK_GT(crop, 0); - out[0] = CreateTensorShape( - vector{batch_size, crop, crop, color ? 3 : 1}, - TensorProto::FLOAT); - out[1] = - CreateTensorShape(vector{1, batch_size}, TensorProto::INT32); - return out; - }) + .TensorInferenceFunction([](const OperatorDef& def, + const vector& /* unused */) { + vector out(2); + ArgumentHelper helper(def); + int batch_size = helper.GetSingleArgument("batch_size", 0); + int crop = helper.GetSingleArgument("crop", -1); + int color = helper.GetSingleArgument("color", 1); + CHECK_GT(crop, 0); + out[0] = CreateTensorShape( + vector{batch_size, crop, crop, color ? 3 : 1}, + TensorProto::FLOAT); + out[1] = + CreateTensorShape(vector{1, batch_size}, TensorProto::INT32); + return out; + }) .SetDoc(R"DOC( Imports and processes images from a database. For each run of the operator, batch_size images will be processed. GPUs can optionally be used for @@ -51,76 +51,117 @@ The following transformations are applied to the image The dimension of the output image will always be cropxcrop )DOC") - .Arg("batch_size", "Number of images to output for each run of the operator" - ". Must be 1 or greater") + .Arg( + "batch_size", + "Number of images to output for each run of the operator" + ". Must be 1 or greater") .Arg("color", "Number of color channels (1 or 3). Defaults to 1") .Arg("color_jitter", "Whether or not to do color jitter. Defaults to 0") - .Arg("img_saturation", "Image saturation scale used in color jittering. " - "Defaults to 0.4") - .Arg("img_brightness", "Image brightness scale used in color jittering. " - "Defaults to 0.4") - .Arg("img_contrast", "Image contrast scale used in color jittering. " - "Defaults to 0.4") - .Arg("color_lighting", "Whether or not to do color lighting." - " Defaults to 0") - .Arg("color_lighting_std", "Std of normal distribution where color lighting" + .Arg( + "img_saturation", + "Image saturation scale used in color jittering. " + "Defaults to 0.4") + .Arg( + "img_brightness", + "Image brightness scale used in color jittering. " + "Defaults to 0.4") + .Arg( + "img_contrast", + "Image contrast scale used in color jittering. 
" + "Defaults to 0.4") + .Arg( + "color_lighting", + "Whether or not to do color lighting." + " Defaults to 0") + .Arg( + "color_lighting_std", + "Std of normal distribution where color lighting" " scaling factor is sampled. Defaults to 0.1") - .Arg("scale_jitter_type", "Type 0: No scale jittering " - "Type 1: Inception-style scale jittering") - .Arg("label_type", "Type 0: single integer label for multi-class " + .Arg( + "scale_jitter_type", + "Type 0: No scale jittering " + "Type 1: Inception-style scale jittering") + .Arg( + "label_type", + "Type 0: single integer label for multi-class " "classification. Type 1: sparse active label indices for multi-label " "classification. Type 2: dense label embedding vector for label " "embedding regression") - .Arg("scale", "Scale the size of the smallest dimension of the image to" - " this. Scale and minsize are mutually exclusive." - " Must be larger than crop") - .Arg("minsize", "Scale the size of the smallest dimension of the image to" - " this only if the size is initially smaller. Scale and minsize are" - " mutually exclusive. Must be larger than crop.") - .Arg("warp", "If 1, both dimensions of the image will be set to minsize or" - " scale; otherwise, the other dimension is proportionally scaled." - " Defaults to 0") + .Arg( + "scale", + "Scale the size of the smallest dimension of the image to" + " this. Scale and minsize are mutually exclusive." + " Must be larger than crop") + .Arg( + "minsize", + "Scale the size of the smallest dimension of the image to" + " this only if the size is initially smaller. Scale and minsize are" + " mutually exclusive. Must be larger than crop.") + .Arg( + "warp", + "If 1, both dimensions of the image will be set to minsize or" + " scale; otherwise, the other dimension is proportionally scaled." + " Defaults to 0") .Arg("crop", "Size to crop the image to. Must be provided") .Arg("mirror", "Whether or not to mirror the image. Defaults to 0") - .Arg("mean", "Mean by which to normalize color channels." - " Defaults to 0.") - .Arg("mean_per_channel", "Vector of means per color channel " - " (1 or 3 elements). Defaults to mean argument. Channel order BGR") - .Arg("std", "Standard deviation by which to normalize color channels." - " Defaults to 1.") - .Arg("std_per_channel", "Vector of standard dev. per color channel " - " (1 or 3 elements). Defaults to std argument. Channel order is BGR") + .Arg( + "mean", + "Mean by which to normalize color channels." + " Defaults to 0.") + .Arg( + "mean_per_channel", + "Vector of means per color channel " + " (1 or 3 elements). Defaults to mean argument. Channel order BGR") + .Arg( + "std", + "Standard deviation by which to normalize color channels." + " Defaults to 1.") + .Arg( + "std_per_channel", + "Vector of standard dev. per color channel " + " (1 or 3 elements). Defaults to std argument. Channel order is BGR") .Arg("bounding_ymin", "Bounding box coordinate. Defaults to -1 (none)") .Arg("bounding_xmin", "Bounding box coordinate. Defaults to -1 (none)") .Arg("bounding_height", "Bounding box coordinate. Defaults to -1 (none)") .Arg("bounding_width", "Bounding box coordinate. Defaults to -1 (none)") .ArgIsTest("Set to 1 to do deterministic cropping. Defaults to 0") .Arg("use_caffe_datum", "1 if the input is in Caffe format. Defaults to 0") - .Arg("use_gpu_transform", "1 if GPU acceleration should be used." - " Defaults to 0. Can only be 1 in a CUDAContext") - .Arg("decode_threads", "Number of CPU decode/transform threads." 
- " Defaults to 4") + .Arg( + "use_gpu_transform", + "1 if GPU acceleration should be used." + " Defaults to 0. Can only be 1 in a CUDAContext") + .Arg( + "decode_threads", + "Number of CPU decode/transform threads." + " Defaults to 4") .Arg("output_type", "If gpu_transform, can set to FLOAT or FLOAT16.") .Arg("db", "Name of the database (if not passed as input)") - .Arg("db_type", "Type of database (if not passed as input)." - " Defaults to leveldb") - .Arg("output_sizes", "The sizes of any outputs besides the data and label " - "(should have a number of elements equal to the number of additional " - "outputs)") - .Arg("random_scale", "[min, max] shortest-side desired for image resize. " - "Defaults to [-1, -1] or no random resize desired.") + .Arg( + "db_type", + "Type of database (if not passed as input)." + " Defaults to leveldb") + .Arg( + "output_sizes", + "The sizes of any outputs besides the data and label " + "(should have a number of elements equal to the number of additional " + "outputs)") + .Arg( + "random_scale", + "[min, max] shortest-side desired for image resize. " + "Defaults to [-1, -1] or no random resize desired.") .Input(0, "reader", "The input reader (a db::DBReader)") .Output(0, "data", "Tensor containing the images") .Output(1, "label", "Tensor containing the labels") - .Output(2, "additional outputs", "Any outputs after the first 2 will be " - "Tensors read from the input TensorProtos"); + .Output( + 2, + "additional outputs", + "Any outputs after the first 2 will be " + "Tensors read from the input TensorProtos"); NO_GRADIENT(ImageInput); #ifdef CAFFE2_USE_MKLDNN -REGISTER_IDEEP_OPERATOR( - ImageInput, - IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR(ImageInput, IDEEPFallbackOp>); #endif -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index bd3fca17bef8..b85091634501 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -4,8 +4,8 @@ #include -#include #include +#include #include "c10/core/thread_pool.h" #include "caffe2/core/common.h" @@ -21,15 +21,15 @@ namespace caffe2 { class CUDAContext; template -class ImageInputOp final - : public PrefetchOperator { +class ImageInputOp final : public PrefetchOperator { // SINGLE_LABEL: single integer label for multi-class classification - // MULTI_LABEL_SPARSE: sparse active label indices for multi-label classification - // MULTI_LABEL_DENSE: dense label embedding vector for label embedding regression - // MULTI_LABEL_WEIGHTED_SPARSE: sparse active label indices with per-label weights - // for multi-label classification - // SINGLE_LABEL_WEIGHTED: single integer label for multi-class classification with weighted sampling - // EMBEDDING_LABEL: an array of floating numbers representing dense embedding. + // MULTI_LABEL_SPARSE: sparse active label indices for multi-label + // classification MULTI_LABEL_DENSE: dense label embedding vector for label + // embedding regression MULTI_LABEL_WEIGHTED_SPARSE: sparse active label + // indices with per-label weights for multi-label classification + // SINGLE_LABEL_WEIGHTED: single integer label for multi-class classification + // with weighted sampling EMBEDDING_LABEL: an array of floating numbers + // representing dense embedding. 
// It is useful for model distillation enum LABEL_TYPE { SINGLE_LABEL = 0, @@ -52,8 +52,7 @@ class ImageInputOp final using OperatorBase::OutputSize; using PrefetchOperator::context_; using PrefetchOperator::prefetch_thread_; - explicit ImageInputOp(const OperatorDef& operator_def, - Workspace* ws); + explicit ImageInputOp(const OperatorDef& operator_def, Workspace* ws); ~ImageInputOp() { PrefetchOperator::Finalize(); } @@ -73,19 +72,26 @@ class ImageInputOp final // Structure to store per-image information // This can be modified by the DecodeAnd* so needs // to be privatized per launch. - using PerImageArg = struct { - BoundingBox bounding_params; - }; + using PerImageArg = struct { BoundingBox bounding_params; }; bool GetImageAndLabelAndInfoFromDBValue( - const string& value, cv::Mat* img, PerImageArg& info, int item_id, + const string& value, + cv::Mat* img, + PerImageArg& info, + int item_id, std::mt19937* randgen); void DecodeAndTransform( - const std::string& value, float *image_data, int item_id, - const int channels, std::size_t thread_index); + const std::string& value, + float* image_data, + int item_id, + const int channels, + std::size_t thread_index); void DecodeAndTransposeOnly( - const std::string& value, uint8_t *image_data, int item_id, - const int channels, std::size_t thread_index); + const std::string& value, + uint8_t* image_data, + int item_id, + const int channels, + std::size_t thread_index); bool ApplyTransformOnGPU( const std::vector& dims, const c10::Device& type); @@ -201,8 +207,8 @@ ImageInputOp::ImageInputOp( 0)), num_decode_threads_( OperatorBase::template GetSingleArgument("decode_threads", 4)), - additional_output_sizes_(OperatorBase::template GetRepeatedArgument( - "output_sizes", {})), + additional_output_sizes_( + OperatorBase::template GetRepeatedArgument("output_sizes", {})), thread_pool_(std::make_shared(num_decode_threads_)), // output type only supported with CUDA and use_gpu_transform for now output_type_( @@ -221,96 +227,102 @@ ImageInputOp::ImageInputOp( } mean_ = OperatorBase::template GetRepeatedArgument( - "mean_per_channel", - {OperatorBase::template GetSingleArgument("mean", 0.)}); + "mean_per_channel", + {OperatorBase::template GetSingleArgument("mean", 0.)}); std_ = OperatorBase::template GetRepeatedArgument( - "std_per_channel", - {OperatorBase::template GetSingleArgument("std", 1.)}); + "std_per_channel", + {OperatorBase::template GetSingleArgument("std", 1.)}); if (additional_output_sizes_.size() == 0) { additional_output_sizes_ = std::vector(OutputSize() - 2, 1); } else { CAFFE_ENFORCE( - additional_output_sizes_.size() == OutputSize() - 2, - "If the output sizes are specified, they must be specified for all " - "additional outputs"); + additional_output_sizes_.size() == OutputSize() - 2, + "If the output sizes are specified, they must be specified for all " + "additional outputs"); } additional_inputs_count_ = OutputSize() - 2; default_arg_.bounding_params = { - false, - OperatorBase::template GetSingleArgument("bounding_ymin", -1), - OperatorBase::template GetSingleArgument("bounding_xmin", -1), - OperatorBase::template GetSingleArgument("bounding_height", -1), - OperatorBase::template GetSingleArgument("bounding_width", -1), + false, + OperatorBase::template GetSingleArgument("bounding_ymin", -1), + OperatorBase::template GetSingleArgument("bounding_xmin", -1), + OperatorBase::template GetSingleArgument("bounding_height", -1), + OperatorBase::template GetSingleArgument("bounding_width", -1), }; if (operator_def.input_size() == 0) { 
LOG(ERROR) << "You are using an old ImageInputOp format that creates " - "a local db reader. Consider moving to the new style " - "that takes in a DBReader blob instead."; - string db_name = - OperatorBase::template GetSingleArgument("db", ""); + "a local db reader. Consider moving to the new style " + "that takes in a DBReader blob instead."; + string db_name = OperatorBase::template GetSingleArgument("db", ""); CAFFE_ENFORCE_GT(db_name.size(), 0, "Must specify a db name."); owned_reader_.reset(new db::DBReader( - OperatorBase::template GetSingleArgument( - "db_type", "leveldb"), + OperatorBase::template GetSingleArgument("db_type", "leveldb"), db_name)); reader_ = owned_reader_.get(); } // hard-coded PCA eigenvectors and eigenvalues, based on RBG channel order color_lighting_eigvecs_.push_back( - std::vector{-144.7125f, 183.396f, 102.2295f}); + std::vector{-144.7125f, 183.396f, 102.2295f}); color_lighting_eigvecs_.push_back( - std::vector{-148.104f, -1.1475f, -207.57f}); + std::vector{-148.104f, -1.1475f, -207.57f}); color_lighting_eigvecs_.push_back( - std::vector{-148.818f, -177.174f, 107.1765f}); + std::vector{-148.818f, -177.174f, 107.1765f}); color_lighting_eigvals_ = std::vector{0.2175f, 0.0188f, 0.0045f}; CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be nonnegative."); if (use_caffe_datum_) { - CAFFE_ENFORCE(label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED, - "Caffe datum only supports single integer label"); + CAFFE_ENFORCE( + label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED, + "Caffe datum only supports single integer label"); } - if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { - CAFFE_ENFORCE_GT(num_labels_, 0, - "Number of labels must be set for using either sparse label indices or dense label embedding."); + if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) { + CAFFE_ENFORCE_GT( + num_labels_, + 0, + "Number of labels must be set for using either sparse label indices or dense label embedding."); } if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE || - label_type_ == SINGLE_LABEL_WEIGHTED) { + label_type_ == SINGLE_LABEL_WEIGHTED) { additional_inputs_offset_ = 3; } else { additional_inputs_offset_ = 2; } - CAFFE_ENFORCE((scale_ > 0) != (minsize_ > 0), - "Must provide one and only one of scaling or minsize"); + CAFFE_ENFORCE( + (scale_ > 0) != (minsize_ > 0), + "Must provide one and only one of scaling or minsize"); CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value."); CAFFE_ENFORCE_GE( - scale_ > 0 ? scale_ : minsize_, - crop_, "The scale/minsize value must be no smaller than the crop value."); + scale_ > 0 ? scale_ : minsize_, + crop_, + "The scale/minsize value must be no smaller than the crop value."); CAFFE_ENFORCE_EQ( mean_.size(), std_.size(), "The mean and std. dev vectors must be of the same size."); - CAFFE_ENFORCE(mean_.size() == 1 || mean_.size() == 3, - "The mean and std. dev vectors must be of size 1 or 3"); + CAFFE_ENFORCE( + mean_.size() == 1 || mean_.size() == 3, + "The mean and std. 
dev vectors must be of size 1 or 3"); CAFFE_ENFORCE( !use_caffe_datum_ || OutputSize() == 2, "There can only be 2 outputs if the Caffe datum format is used"); - CAFFE_ENFORCE(random_scale_.size() == 2, - "Must provide [scale_min, scale_max]"); - CAFFE_ENFORCE_GE(random_scale_[1], random_scale_[0], + CAFFE_ENFORCE( + random_scale_.size() == 2, "Must provide [scale_min, scale_max]"); + CAFFE_ENFORCE_GE( + random_scale_[1], + random_scale_[0], "random scale must provide a range [min, max]"); - if (default_arg_.bounding_params.ymin < 0 - || default_arg_.bounding_params.xmin < 0 - || default_arg_.bounding_params.height < 0 - || default_arg_.bounding_params.width < 0) { + if (default_arg_.bounding_params.ymin < 0 || + default_arg_.bounding_params.xmin < 0 || + default_arg_.bounding_params.height < 0 || + default_arg_.bounding_params.width < 0) { default_arg_.bounding_params.valid = false; } else { default_arg_.bounding_params.valid = true; @@ -334,11 +346,10 @@ ImageInputOp::ImageInputOp( LOG(INFO) << " Applying a default bounding box of Y [" << default_arg_.bounding_params.ymin << "; " << default_arg_.bounding_params.ymin + - default_arg_.bounding_params.height - << ") x X [" - << default_arg_.bounding_params.xmin << "; " + default_arg_.bounding_params.height + << ") x X [" << default_arg_.bounding_params.xmin << "; " << default_arg_.bounding_params.xmin + - default_arg_.bounding_params.width + default_arg_.bounding_params.width << ")"; } if (scale_ > 0 && !random_scaling_) { @@ -348,8 +359,7 @@ ImageInputOp::ImageInputOp( if (random_scaling_) { // randomly set min_size_ for each image LOG(INFO) << " Randomly scaling shortest side between " - << random_scale_[0] << " and " - << random_scale_[1]; + << random_scale_[0] << " and " << random_scale_[1]; } else { // Here, minsize_ > 0 LOG(INFO) << " Ensuring minimum image size of " << minsize_ @@ -365,16 +375,16 @@ ImageInputOp::ImageInputOp( auto mit = mean_.begin(); auto sit = std_.begin(); - for (int i = 0; - mit != mean_.end() && sit != std_.end(); - ++mit, ++sit, ++i) { + for (int i = 0; mit != mean_.end() && sit != std_.end(); ++mit, ++sit, ++i) { LOG(INFO) << " Default [Channel " << i << "] Subtract mean " << *mit << " and divide by std " << *sit << "."; // We actually will use the inverse of std, so inverse it here *sit = 1.f / *sit; } LOG(INFO) << " Outputting images as " - << OperatorBase::template GetSingleArgument("output_type", "unknown") << "."; + << OperatorBase::template GetSingleArgument( + "output_type", "unknown") + << "."; std::mt19937 meta_randgen(time(nullptr)); for (int i = 0; i < num_decode_threads_; ++i) { @@ -394,25 +404,17 @@ ImageInputOp::ImageInputOp( sizes = std::vector{batch_size_}; } // data type for prefetched_label_ is actually not known here.. 
- ReinitializeTensor( - &prefetched_label_, - sizes, - at::dtype().device(CPU)); + ReinitializeTensor(&prefetched_label_, sizes, at::dtype().device(CPU)); for (int i = 0; i < additional_output_sizes_.size(); ++i) { prefetched_additional_outputs_on_device_.emplace_back(); prefetched_additional_outputs_.emplace_back(); } - } // Inception-stype scale jittering template -bool RandomSizedCropping( - cv::Mat* img, - const int crop, - std::mt19937* randgen -) { +bool RandomSizedCropping(cv::Mat* img, const int crop, std::mt19937* randgen) { cv::Mat scaled_img; bool inception_scale_jitter = false; int im_height = img->rows, im_width = img->cols; @@ -426,20 +428,15 @@ bool RandomSizedCropping( float aspect_ratio = aspect_ratio_dis(*randgen); int nh = floor(std::sqrt(((float)target_area / aspect_ratio))); int nw = floor(std::sqrt(((float)target_area * aspect_ratio))); - if (nh >= 1 && nh <= im_height && nw >=1 && nw <= im_width) { - int height_offset = std::uniform_int_distribution<>( - 0, im_height - nh)(*randgen); - int width_offset = std::uniform_int_distribution<>( - 0,im_width - nw)(*randgen); + if (nh >= 1 && nh <= im_height && nw >= 1 && nw <= im_width) { + int height_offset = + std::uniform_int_distribution<>(0, im_height - nh)(*randgen); + int width_offset = + std::uniform_int_distribution<>(0, im_width - nw)(*randgen); cv::Rect ROI(width_offset, height_offset, nw, nh); cropping = (*img)(ROI); cv::resize( - cropping, - scaled_img, - cv::Size(crop, crop), - 0, - 0, - cv::INTER_AREA); + cropping, scaled_img, cv::Size(crop, crop), 0, 0, cv::INTER_AREA); *img = scaled_img; inception_scale_jitter = true; break; @@ -697,7 +694,8 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( if (out_c == src.channels()) { *img = src; } else { - cv::cvtColor(src, *img, (out_c == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR); + cv::cvtColor( + src, *img, (out_c == 1) ? cv::COLOR_BGR2GRAY : cv::COLOR_GRAY2BGR); } // Note(Yangqing): I believe that the mat should be created continuous. 
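Editor's note on the hunk above: RandomSizedCropping implements Inception-style scale jittering by repeatedly sampling a target area fraction and aspect ratio until a feasible crop is found. Below is a minimal standalone sketch of just that sampling step; the 0.08-1.0 area-fraction and 3/4-4/3 aspect-ratio ranges are the conventional Inception choices and are assumptions here (the operator's actual distributions are defined in code not shown in this hunk), and the OpenCV crop/resize is omitted.

// Standalone sketch of the Inception-style crop sampling used by
// RandomSizedCropping above. Ranges are assumed, not taken from this diff.
#include <cmath>
#include <cstdio>
#include <random>

struct CropRect {
  int x, y, w, h;
  bool found;
};

CropRect SampleInceptionCrop(int im_width, int im_height, std::mt19937* rng) {
  std::uniform_real_distribution<float> area_frac(0.08f, 1.0f);
  std::uniform_real_distribution<float> aspect(0.75f, 4.0f / 3.0f);
  for (int attempt = 0; attempt < 10; ++attempt) {
    const float target_area = area_frac(*rng) * im_width * im_height;
    const float ar = aspect(*rng);
    // Same convention as above: height ~ sqrt(area / ar), width ~ sqrt(area * ar).
    const int nh = static_cast<int>(std::floor(std::sqrt(target_area / ar)));
    const int nw = static_cast<int>(std::floor(std::sqrt(target_area * ar)));
    if (nh >= 1 && nh <= im_height && nw >= 1 && nw <= im_width) {
      const int y = std::uniform_int_distribution<>(0, im_height - nh)(*rng);
      const int x = std::uniform_int_distribution<>(0, im_width - nw)(*rng);
      return {x, y, nw, nh, true};  // caller would crop this rect, then resize to crop x crop
    }
  }
  return {0, 0, im_width, im_height, false};  // no feasible crop; fall back to simple cropping
}

int main() {
  std::mt19937 rng(42);
  const CropRect r = SampleInceptionCrop(640, 480, &rng);
  std::printf("crop x=%d y=%d w=%d h=%d found=%d\n", r.x, r.y, r.w, r.h, r.found);
  return 0;
}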
@@ -706,23 +704,26 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( // Sanity check now that we decoded everything // Ensure that the bounding box is legit - if (info.bounding_params.valid - && (src.rows < info.bounding_params.ymin + info.bounding_params.height - || src.cols < info.bounding_params.xmin + info.bounding_params.width - )) { + if (info.bounding_params.valid && + (src.rows < info.bounding_params.ymin + info.bounding_params.height || + src.cols < info.bounding_params.xmin + info.bounding_params.width)) { info.bounding_params.valid = false; } // Apply the bounding box if requested if (info.bounding_params.valid) { // If we reach here, we know the parameters are sane - cv::Rect bounding_box(info.bounding_params.xmin, info.bounding_params.ymin, - info.bounding_params.width, info.bounding_params.height); + cv::Rect bounding_box( + info.bounding_params.xmin, + info.bounding_params.ymin, + info.bounding_params.width, + info.bounding_params.height); *img = (*img)(bounding_box); /* LOG(INFO) << "Did bounding with ymin:" - << info.bounding_params.ymin << " xmin:" << info.bounding_params.xmin + << info.bounding_params.ymin << " xmin:" << + info.bounding_params.xmin << " height:" << info.bounding_params.height << " width:" << info.bounding_params.width << "\n"; LOG(INFO) << "Bounded matrix: " << img; @@ -736,52 +737,51 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( if (scale_jitter_type_ == INCEPTION_STYLE) { if (!is_test_) { // Inception-stype scale jittering is only used for training - inception_scale_jitter = RandomSizedCropping(img, crop_, randgen); + inception_scale_jitter = + RandomSizedCropping(img, crop_, randgen); // if a random crop is still not found, do simple random cropping later } } if ((scale_jitter_type_ == NO_SCALE_JITTER) || - (scale_jitter_type_ == INCEPTION_STYLE && !inception_scale_jitter)) { - int scaled_width, scaled_height; - int scale_to_use = scale_ > 0 ? scale_ : minsize_; - - // set the random minsize - if (random_scaling_) { - scale_to_use = std::uniform_int_distribution<>(random_scale_[0], - random_scale_[1])(*randgen); - } + (scale_jitter_type_ == INCEPTION_STYLE && !inception_scale_jitter)) { + int scaled_width, scaled_height; + int scale_to_use = scale_ > 0 ? 
scale_ : minsize_; - if (warp_) { - scaled_width = scale_to_use; - scaled_height = scale_to_use; - } else if (img->rows > img->cols) { - scaled_width = scale_to_use; - scaled_height = - static_cast(img->rows) * scale_to_use / img->cols; - } else { - scaled_height = scale_to_use; - scaled_width = - static_cast(img->cols) * scale_to_use / img->rows; - } - if ((scale_ > 0 && - (scaled_height != img->rows || scaled_width != img->cols)) - || (scaled_height > img->rows || scaled_width > img->cols)) { - // We rescale in all cases if we are using scale_ - // but only to make the image bigger if using minsize_ - /* - LOG(INFO) << "Scaling to " << scaled_width << " x " << scaled_height - << " From " << img->cols << " x " << img->rows; - */ - cv::resize( - *img, - scaled_img, - cv::Size(scaled_width, scaled_height), - 0, - 0, - cv::INTER_AREA); - *img = scaled_img; - } + // set the random minsize + if (random_scaling_) { + scale_to_use = std::uniform_int_distribution<>( + random_scale_[0], random_scale_[1])(*randgen); + } + + if (warp_) { + scaled_width = scale_to_use; + scaled_height = scale_to_use; + } else if (img->rows > img->cols) { + scaled_width = scale_to_use; + scaled_height = static_cast(img->rows) * scale_to_use / img->cols; + } else { + scaled_height = scale_to_use; + scaled_width = static_cast(img->cols) * scale_to_use / img->rows; + } + if ((scale_ > 0 && + (scaled_height != img->rows || scaled_width != img->cols)) || + (scaled_height > img->rows || scaled_width > img->cols)) { + // We rescale in all cases if we are using scale_ + // but only to make the image bigger if using minsize_ + /* + LOG(INFO) << "Scaling to " << scaled_width << " x " << scaled_height + << " From " << img->cols << " x " << img->rows; + */ + cv::resize( + *img, + scaled_img, + cv::Size(scaled_width, scaled_height), + 0, + 0, + cv::INTER_AREA); + *img = scaled_img; + } } // TODO(Yangqing): return false if any error happens. 
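The resize rule in the hunk above (always resize to match scale_, but only upscale when minsize_ is used) can be restated as a small pure function. This is only an illustrative restatement of the logic shown above, not part of the patch; names are invented for the sketch.

// Illustrative restatement of the scale_/minsize_ resize decision above.
#include <cstdio>

struct ResizePlan {
  int width, height;
  bool do_resize;
};

ResizePlan PlanResize(int cols, int rows, int scale, int minsize, bool warp) {
  // Exactly one of scale/minsize is expected to be set (> 0), as enforced in
  // the operator constructor earlier in this file.
  const bool use_scale = scale > 0;
  const int scale_to_use = use_scale ? scale : minsize;
  ResizePlan p{};
  if (warp) {
    // Ignore aspect ratio: both sides become scale_to_use.
    p.width = scale_to_use;
    p.height = scale_to_use;
  } else if (rows > cols) {
    // Shorter side (cols) becomes scale_to_use; the other side keeps the ratio.
    p.width = scale_to_use;
    p.height = static_cast<int>(static_cast<long long>(rows) * scale_to_use / cols);
  } else {
    p.height = scale_to_use;
    p.width = static_cast<int>(static_cast<long long>(cols) * scale_to_use / rows);
  }
  // With scale_: resize whenever the size changes at all.
  // With minsize_: resize only if it makes the image bigger.
  p.do_resize = (use_scale && (p.height != rows || p.width != cols)) ||
      (p.height > rows || p.width > cols);
  return p;
}

int main() {
  const ResizePlan a = PlanResize(800, 600, /*scale=*/256, /*minsize=*/0, /*warp=*/false);
  const ResizePlan b = PlanResize(800, 600, /*scale=*/0, /*minsize=*/256, /*warp=*/false);
  std::printf("scale:   %dx%d resize=%d\n", a.width, a.height, a.do_resize);
  std::printf("minsize: %dx%d resize=%d\n", b.width, b.height, b.do_resize);
  return 0;
}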
@@ -791,19 +791,18 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( // assume HWC order and color channels BGR template void Saturation( - float* img, - const int img_size, - const float alpha_rand, - std::mt19937* randgen -) { + float* img, + const int img_size, + const float alpha_rand, + std::mt19937* randgen) { float alpha = 1.0f + - std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { float gray_color = img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + - img[3 * p + 2] * 0.299f; + img[3 * p + 2] * 0.299f; for (int c = 0; c < 3; ++c) { img[3 * p + c] = img[3 * p + c] * alpha + gray_color * (1.0f - alpha); } @@ -815,13 +814,12 @@ void Saturation( // assume HWC order and color channels BGR template void Brightness( - float* img, - const int img_size, - const float alpha_rand, - std::mt19937* randgen -) { + float* img, + const int img_size, + const float alpha_rand, + std::mt19937* randgen) { float alpha = 1.0f + - std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { @@ -835,25 +833,24 @@ void Brightness( // assume HWC order and color channels BGR template void Contrast( - float* img, - const int img_size, - const float alpha_rand, - std::mt19937* randgen -){ + float* img, + const int img_size, + const float alpha_rand, + std::mt19937* randgen) { float gray_mean = 0; int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 gray_mean += img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + - img[3 * p + 2] * 0.299f; + img[3 * p + 2] * 0.299f; p++; } } gray_mean /= (img_size * img_size); float alpha = 1.0f + - std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { @@ -868,19 +865,20 @@ void Contrast( // assume HWC order and color channels BGR template void ColorJitter( - float* img, - const int img_size, - const float saturation, - const float brightness, - const float contrast, - std::mt19937* randgen -) { - std::srand (unsigned(std::time(0))); + float* img, + const int img_size, + const float saturation, + const float brightness, + const float contrast, + std::mt19937* randgen) { + std::srand(unsigned(std::time(0))); std::vector jitter_order{0, 1, 2}; // obtain a time-based seed: unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::shuffle(jitter_order.begin(), jitter_order.end(), - std::default_random_engine(seed)); + std::shuffle( + jitter_order.begin(), + jitter_order.end(), + std::default_random_engine(seed)); for (int i = 0; i < 3; ++i) { if (jitter_order[i] == 0) { @@ -896,13 +894,12 @@ void ColorJitter( // assume HWC order and color channels BGR template void ColorLighting( - float* img, - const int img_size, - const float alpha_std, - const std::vector>& eigvecs, - const std::vector& eigvals, - std::mt19937* randgen -) { + float* img, + const int img_size, + const float alpha_std, + const std::vector>& eigvecs, + const std::vector& eigvals, + std::mt19937* randgen) { std::normal_distribution d(0, alpha_std); 
std::vector alphas(3); for (int i = 0; i < 3; ++i) { @@ -924,19 +921,17 @@ void ColorLighting( } } } - } // assume HWC order and color channels BGR // mean subtraction and scaling. template void ColorNormalization( - float* img, - const int img_size, - const int channels, - const std::vector& mean, - const std::vector& std -) { + float* img, + const int img_size, + const int channels, + const std::vector& mean, + const std::vector& std) { int p = 0; for (int h = 0; h < img_size; ++h) { for (int w = 0; w < img_size; ++w) { @@ -981,9 +976,9 @@ void TransformImage( height_offset = (scaled_img.rows - crop) / 2; } else { width_offset = - std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); height_offset = - std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); } float* image_data_ptr = image_data; @@ -1010,12 +1005,17 @@ void TransformImage( } if (color_jitter && channels == 3 && !is_test) { - ColorJitter(image_data, crop, saturation, brightness, contrast, - randgen); + ColorJitter( + image_data, crop, saturation, brightness, contrast, randgen); } if (color_lighting && channels == 3 && !is_test) { - ColorLighting(image_data, crop, color_lighting_std, - color_lighting_eigvecs, color_lighting_eigvals, randgen); + ColorLighting( + image_data, + crop, + color_lighting_std, + color_lighting_eigvecs, + color_lighting_eigvals, + randgen); } // Color normalization @@ -1026,11 +1026,15 @@ void TransformImage( // Only crop / transose the image // leave in uint8_t dataType template -void CropTransposeImage(const cv::Mat& scaled_img, const int channels, - uint8_t *cropped_data, const int crop, - const bool mirror, std::mt19937 *randgen, - std::bernoulli_distribution *mirror_this_image, - bool is_test = false) { +void CropTransposeImage( + const cv::Mat& scaled_img, + const int channels, + uint8_t* cropped_data, + const int crop, + const bool mirror, + std::mt19937* randgen, + std::bernoulli_distribution* mirror_this_image, + bool is_test = false) { CAFFE_ENFORCE_GE( scaled_img.rows, crop, "Image height must be bigger than crop."); CAFFE_ENFORCE_GE( @@ -1043,16 +1047,16 @@ void CropTransposeImage(const cv::Mat& scaled_img, const int channels, height_offset = (scaled_img.rows - crop) / 2; } else { width_offset = - std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.cols - crop)(*randgen); height_offset = - std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); + std::uniform_int_distribution<>(0, scaled_img.rows - crop)(*randgen); } if (mirror && (*mirror_this_image)(*randgen)) { // Copy mirrored image. for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset + crop - 1; w >= width_offset; --w) { - const uint8_t* cv_data = scaled_img.ptr(h) + w*channels; + const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; for (int c = 0; c < channels; ++c) { *(cropped_data++) = cv_data[c]; } @@ -1062,7 +1066,7 @@ void CropTransposeImage(const cv::Mat& scaled_img, const int channels, // Copy normally. 
for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset; w < width_offset + crop; ++w) { - const uint8_t* cv_data = scaled_img.ptr(h) + w*channels; + const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; for (int c = 0; c < channels; ++c) { *(cropped_data++) = cv_data[c]; } @@ -1075,9 +1079,11 @@ void CropTransposeImage(const cv::Mat& scaled_img, const int channels, // Intended as entry point for binding to thread pool template void ImageInputOp::DecodeAndTransform( - const std::string& value, float *image_data, int item_id, - const int channels, std::size_t thread_index) { - + const std::string& value, + float* image_data, + int item_id, + const int channels, + std::size_t thread_index) { CAFFE_ENFORCE((int)thread_index < num_decode_threads_); std::bernoulli_distribution mirror_this_image(0.5f); @@ -1089,18 +1095,34 @@ void ImageInputOp::DecodeAndTransform( CHECK( GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen)); // Factor out the image transformation - TransformImage(img, channels, image_data, - color_jitter_, img_saturation_, img_brightness_, img_contrast_, - color_lighting_, color_lighting_std_, color_lighting_eigvecs_, - color_lighting_eigvals_, crop_, mirror_, mean_, std_, - randgen, &mirror_this_image, is_test_); + TransformImage( + img, + channels, + image_data, + color_jitter_, + img_saturation_, + img_brightness_, + img_contrast_, + color_lighting_, + color_lighting_std_, + color_lighting_eigvecs_, + color_lighting_eigvals_, + crop_, + mirror_, + mean_, + std_, + randgen, + &mirror_this_image, + is_test_); } template void ImageInputOp::DecodeAndTransposeOnly( - const std::string& value, uint8_t *image_data, int item_id, - const int channels, std::size_t thread_index) { - + const std::string& value, + uint8_t* image_data, + int item_id, + const int channels, + std::size_t thread_index) { CAFFE_ENFORCE((int)thread_index < num_decode_threads_); std::bernoulli_distribution mirror_this_image(0.5f); @@ -1113,11 +1135,17 @@ void ImageInputOp::DecodeAndTransposeOnly( GetImageAndLabelAndInfoFromDBValue(value, &img, info, item_id, randgen)); // Factor out the image transformation - CropTransposeImage(img, channels, image_data, crop_, mirror_, - randgen, &mirror_this_image, is_test_); + CropTransposeImage( + img, + channels, + image_data, + crop_, + mirror_, + randgen, + &mirror_this_image, + is_test_); } - template bool ImageInputOp::Prefetch() { if (!owned_reader_.get()) { @@ -1146,16 +1174,16 @@ bool ImageInputOp::Prefetch() { reader_->Read(&key, &value); // determine label type based on first item - if( item_id == 0 ) { - if( use_caffe_datum_ ) { + if (item_id == 0) { + if (use_caffe_datum_) { prefetched_label_.mutable_data(); } else { TensorProtos protos; CAFFE_ENFORCE(protos.ParseFromString(value)); TensorProto_DataType labeldt = protos.protos(1).data_type(); - if( labeldt == TensorProto::INT32 ) { + if (labeldt == TensorProto::INT32) { prefetched_label_.mutable_data(); - } else if ( labeldt == TensorProto::FLOAT) { + } else if (labeldt == TensorProto::FLOAT) { prefetched_label_.mutable_data(); } else { LOG(FATAL) << "Unsupported label type."; @@ -1164,7 +1192,8 @@ bool ImageInputOp::Prefetch() { for (int i = 0; i < additional_inputs_count_; ++i) { int index = additional_inputs_offset_ + i; TensorProto additional_output_proto = protos.protos(index); - auto sizes = std::vector({batch_size_, additional_output_sizes_[i]}); + auto sizes = + std::vector({batch_size_, additional_output_sizes_[i]}); if 
(additional_output_proto.data_type() == TensorProto::FLOAT) { prefetched_additional_outputs_[i] = caffe2::empty(sizes, at::dtype().device(CPU)); @@ -1312,6 +1341,6 @@ bool ImageInputOp::CopyPrefetched() { } return true; } -} // namespace caffe2 +} // namespace caffe2 -#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ +#endif // CAFFE2_IMAGE_IMAGE_INPUT_OP_H_ diff --git a/caffe2/image/image_input_op_gpu.cc b/caffe2/image/image_input_op_gpu.cc index 56d2f3dd317b..a484585770e0 100644 --- a/caffe2/image/image_input_op_gpu.cc +++ b/caffe2/image/image_input_op_gpu.cc @@ -35,4 +35,4 @@ bool ImageInputOp::ApplyTransformOnGPU( REGISTER_CUDA_OPERATOR(ImageInput, ImageInputOp); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index f556e9c7956c..0131c9179d7d 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -2072,7 +2072,9 @@ bool RunOnDevice() override { OperatorBase::GetSingleArgument("post_nms_topN", 300)), rpn_nms_thresh_( OperatorBase::GetSingleArgument("nms_thresh", 0.7f)), - rpn_min_size_(OperatorBase::GetSingleArgument("min_size", 16)) {} + rpn_min_size_(OperatorBase::GetSingleArgument("min_size", 16)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) {} template std::vector nms_metal( @@ -2207,14 +2209,21 @@ void ProposalsForOneImage( Eigen::Map(scores.data(), H * W, A) = Eigen::Map(scores_tensor.data(), A, H * W).transpose(); // Transform anchors into proposals via bbox transformations - auto proposals = utils::bbox_transform(all_anchors.array(), bbox_deltas); + auto proposals = utils::bbox_transform( + all_anchors.array(), + bbox_deltas, + std::vector{1.0, 1.0, 1.0, 1.0}, + utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_); // 2. clip proposals to image (may result in proposals with zero area // that will be removed in the next step) - proposals = utils::clip_boxes(proposals, im_info[0], im_info[1]); + proposals = utils::clip_boxes( + proposals, im_info[0], im_info[1], 1.0, legacy_plus_one_); // 3. remove predicted boxes with either height or width < min_size - auto keep = utils::filter_boxes(proposals, min_size, im_info); + auto keep = + utils::filter_boxes(proposals, min_size, im_info, legacy_plus_one_); DCHECK_LE(keep.size(), scores.size()); @@ -2334,6 +2343,8 @@ bool RunOnDevice() override { float rpn_nms_thresh_{0.7}; // RPN_MIN_SIZE float rpn_min_size_{16}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; // threads per thread group, used in nms ushort maxThreadsPerThreadgroup{32}; diff --git a/caffe2/operators/batch_matmul_op.cc b/caffe2/operators/batch_matmul_op.cc index 62ad5ad57121..aeefc131f002 100644 --- a/caffe2/operators/batch_matmul_op.cc +++ b/caffe2/operators/batch_matmul_op.cc @@ -129,7 +129,7 @@ from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being two dimensional, it behaves like normal matrix multiplication. )DOC") .Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)") - .Input(1, "B", "tensor of shpae (dim0, dim2 ... K, N)") + .Input(1, "B", "tensor of shape (dim0, dim1 ... K, N)") .Output(0, "Y", "tensor of shape (dim0, dim1 ...
M, N)") .Arg( "trans_a", diff --git a/caffe2/operators/bbox_transform_op.cc b/caffe2/operators/bbox_transform_op.cc index f8c795c5dba0..e09d795320c9 100644 --- a/caffe2/operators/bbox_transform_op.cc +++ b/caffe2/operators/bbox_transform_op.cc @@ -154,11 +154,12 @@ bool BBoxTransformOp::RunOnDevice() { cur_deltas, weights_, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, angle_bound_on_, angle_bound_lo_, angle_bound_hi_); - EArrXXf clip_boxes = - utils::clip_boxes(trans_boxes, img_h, img_w, clip_angle_thresh_); + EArrXXf clip_boxes = utils::clip_boxes( + trans_boxes, img_h, img_w, clip_angle_thresh_, legacy_plus_one_); // Do not apply scale for angle in rotated boxes clip_boxes.leftCols(4) *= scale_after; new_boxes.block(offset, k * box_dim, num_rois, box_dim) = clip_boxes; @@ -184,6 +185,7 @@ bool BBoxTransformOp::RunOnDevice() { using BBoxTransformOpFloatCPU = caffe2::BBoxTransformOp; +// clang-format off C10_REGISTER_CAFFE2_OPERATOR_CPU( BBoxTransform, "_caffe2::BBoxTransform(" @@ -196,9 +198,11 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( "bool angle_bound_on, " "int angle_bound_lo, " "int angle_bound_hi, " - "float clip_angle_thresh" + "float clip_angle_thresh, " + "bool legacy_plus_one" ") -> (" "Tensor output_0, " "Tensor output_1" ")", BBoxTransformOpFloatCPU); +// clang-format on diff --git a/caffe2/operators/bbox_transform_op.h b/caffe2/operators/bbox_transform_op.h index e2bcf9b8c05d..57eef7533206 100644 --- a/caffe2/operators/bbox_transform_op.h +++ b/caffe2/operators/bbox_transform_op.h @@ -3,11 +3,11 @@ #ifndef BBOX_TRANSFORM_OP_H_ #define BBOX_TRANSFORM_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" -#include "caffe2/core/c10_operator.h" C10_DECLARE_CAFFE2_OPERATOR(BBoxTransform) @@ -16,7 +16,7 @@ namespace caffe2 { template class BBoxTransformOp final : public Operator { public: - template + template explicit BBoxTransformOp(Args&&... args) : Operator(std::forward(args)...), weights_(this->template GetRepeatedArgument( @@ -32,7 +32,9 @@ class BBoxTransformOp final : public Operator { angle_bound_hi_( this->template GetSingleArgument("angle_bound_hi", 90)), clip_angle_thresh_( - this->template GetSingleArgument("clip_angle_thresh", 1.0)) { + this->template GetSingleArgument("clip_angle_thresh", 1.0)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { CAFFE_ENFORCE_EQ( weights_.size(), 4, @@ -62,6 +64,8 @@ class BBoxTransformOp final : public Operator { // tolerance for backward compatibility. Set to negative value for // no clipping. float clip_angle_thresh_{1.0}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; }; } // namespace caffe2 diff --git a/caffe2/operators/box_with_nms_limit_op.cc b/caffe2/operators/box_with_nms_limit_op.cc index b780bc2994c4..c1890de4c60c 100644 --- a/caffe2/operators/box_with_nms_limit_op.cc +++ b/caffe2/operators/box_with_nms_limit_op.cc @@ -98,7 +98,9 @@ bool BoxWithNMSLimitOp::RunOnDevice() { soft_nms_sigma_, nms_thres_, soft_nms_min_score_thres_, - soft_nms_method_); + soft_nms_method_, + -1, /* topN */ + legacy_plus_one_); } else { std::sort( inds.data(), @@ -107,8 +109,13 @@ bool BoxWithNMSLimitOp::RunOnDevice() { return cur_scores(lhs) > cur_scores(rhs); }); int keep_max = detections_per_im_ > 0 ? 
detections_per_im_ : -1; - keeps[j] = - utils::nms_cpu(cur_boxes, cur_scores, inds, nms_thres_, keep_max); + keeps[j] = utils::nms_cpu( + cur_boxes, + cur_scores, + inds, + nms_thres_, + keep_max, + legacy_plus_one_); } total_keep_count += keeps[j].size(); } @@ -300,6 +307,7 @@ SHOULD_NOT_DO_GRADIENT(BoxWithNMSLimit); } // namespace } // namespace caffe2 +// clang-format off C10_REGISTER_CAFFE2_OPERATOR_CPU( BoxWithNMSLimit, "_caffe2::BoxWithNMSLimit(" @@ -316,7 +324,8 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( "bool rotated, " "bool cls_agnostic_bbox_reg, " "bool input_boxes_include_bg_cls, " - "bool output_classes_include_bg_cls " + "bool output_classes_include_bg_cls, " + "bool legacy_plus_one " ") -> (" "Tensor scores, " "Tensor boxes, " @@ -326,3 +335,4 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( //"Tensor keeps_size, " ")", caffe2::BoxWithNMSLimitOp); +// clang-format on diff --git a/caffe2/operators/box_with_nms_limit_op.h b/caffe2/operators/box_with_nms_limit_op.h index 090993fcef25..d0c7c6a37a3c 100644 --- a/caffe2/operators/box_with_nms_limit_op.h +++ b/caffe2/operators/box_with_nms_limit_op.h @@ -3,10 +3,9 @@ #ifndef BOX_WITH_NMS_AND_LIMIT_OP_H_ #define BOX_WITH_NMS_AND_LIMIT_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/operator.h" -#include "caffe2/core/c10_operator.h" - C10_DECLARE_CAFFE2_OPERATOR(BoxWithNMSLimit) @@ -44,7 +43,9 @@ class BoxWithNMSLimitOp final : public Operator { true)), output_classes_include_bg_cls_(this->template GetSingleArgument( "output_classes_include_bg_cls", - true)) { + true)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { CAFFE_ENFORCE( soft_nms_method_str_ == "linear" || soft_nms_method_str_ == "gaussian", "Unexpected soft_nms_method"); @@ -91,6 +92,8 @@ class BoxWithNMSLimitOp final : public Operator { // The index where foreground starts in scoures. Eg. if 0 represents // background class then foreground class starts with 1. int input_scores_fg_cls_starting_id_{1}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; // Map a class id (starting with background and then foreground) from (0, 1, // ..., NUM_FG_CLASSES) to it's matching value in box diff --git a/caffe2/operators/bucketize_op.cc b/caffe2/operators/bucketize_op.cc new file mode 100644 index 000000000000..a7e9229e70ff --- /dev/null +++ b/caffe2/operators/bucketize_op.cc @@ -0,0 +1,64 @@ +#include "caffe2/operators/bucketize_op.h" + +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" + +namespace caffe2 { + +template +bool BucketizeOp::RunOnDevice() { + auto& input = Input(X); + CAFFE_ENFORCE_GE(input.dim(), 1); + + auto N = input.numel(); + auto* output = Output(INDICES, input.sizes(), at::dtype()); + const auto* input_data = input.template data(); + auto* output_data = output->template mutable_data(); + + math::Set(output->numel(), 0.0, output_data, &context_); + + for (int64_t pos = 0; pos < N; pos++) { + // here we assume the boundary values for each feature are sorted + int64_t bucket_idx = + std::lower_bound( + boundaries_.begin(), boundaries_.end(), input_data[pos]) - + boundaries_.begin(); + output_data[pos] = bucket_idx; + } + + return true; +}; +REGISTER_CPU_OPERATOR(Bucketize, BucketizeOp); + +OPERATOR_SCHEMA(Bucketize) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +This operator works as bucketize in tensorflow and digitize +in numpy. It bucketizes the input 'X' based on argument 'boundaries'. 
+For each value x in input 'data', the operator returns index i given +boundaries[i-1] < x <= boundaries[i]. +If values in 'data' are beyond the bounds of boundaries, 0 or +len(boundaries) is returned as appropriate. +The boundaries need to be monotonically increasing. +For example: + +If data = [2, 4, 1] and boundaries = [0.1, 2.5], then + +output = [1, 2, 1] + +If data = [[2, 3], [4, 1], [2, 5]] and boundaries = [0.1, 2.5], then + +output = [[1, 2], [2, 1], [1, 2]] + +)DOC") + .Input(0, "data", "input tensor") + .Output( + 0, + "output", + "indices of bins given by boundaries to which each value " + "in data belongs") + .Arg("boundaries", "bucketization boundaries"); + +NO_GRADIENT(BucketizeOp); +} // namespace caffe2 diff --git a/caffe2/operators/bucketize_op.cu b/caffe2/operators/bucketize_op.cu new file mode 100644 index 000000000000..1864c08fa637 --- /dev/null +++ b/caffe2/operators/bucketize_op.cu @@ -0,0 +1,7 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/bucketize_op.h" +#include "caffe2/operators/operator_fallback_gpu.h" + +namespace caffe2 { +REGISTER_CUDA_OPERATOR(Bucketize, GPUFallbackOp); +} // namespace caffe2 diff --git a/caffe2/operators/bucketize_op.h b/caffe2/operators/bucketize_op.h new file mode 100644 index 000000000000..7e536b701a61 --- /dev/null +++ b/caffe2/operators/bucketize_op.h @@ -0,0 +1,37 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#ifndef CAFFE2_OPERATORS_BUCKETIZE_OP_H_ +#define CAFFE2_OPERATORS_BUCKETIZE_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class BucketizeOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + BucketizeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + boundaries_(this->template GetRepeatedArgument("boundaries")) { + CAFFE_ENFORCE( + std::is_sorted(boundaries_.begin(), boundaries_.end()), + "The boundaries need to be monotonically increasing"); + } + + bool RunOnDevice() override; + + protected: + INPUT_TAGS(X); + OUTPUT_TAGS(INDICES); + + private: + std::vector boundaries_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BUCKETIZE_OP_H_ diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc index 3e15a9e0ee82..53390ac6ed51 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc @@ -5,14 +5,14 @@ namespace caffe2 { namespace utils { // Compute the area of an array of boxes. -ERArrXXf BoxesArea(const ERArrXXf& boxes) { +ERArrXXf BoxesArea(const ERArrXXf& boxes, const bool legacy_plus_one) { // equivalent to python code // w = (boxes[:, 2] - boxes[:, 0] + 1) // h = (boxes[:, 3] - boxes[:, 1] + 1) // areas = w * h // assert np.all(areas >= 0), 'Negative areas founds' - const auto w = boxes.col(2) - boxes.col(0) + 1; - const auto h = boxes.col(3) - boxes.col(1) + 1; + const auto w = boxes.col(2) - boxes.col(0) + int(legacy_plus_one); + const auto h = boxes.col(3) - boxes.col(1) + int(legacy_plus_one); const ERArrXXf areas = w * h; CAFFE_ENFORCE((areas >= 0).all(), "Negative areas found: ", boxes); return areas; @@ -20,11 +20,15 @@ ERArrXXf BoxesArea(const ERArrXXf& boxes) { // Determine which FPN level each RoI in a set of RoIs should map to based // on the heuristic in the FPN paper.
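To make legacy_plus_one and the FPN mapping concrete: the sketch below computes a box area with and without the legacy "+ 1" and then applies the FPN-paper heuristic that the level-mapping helper below is described as implementing. It is illustrative only; the canonical scale/level (224, 4) and the level range (2..5) are the defaults declared elsewhere in this patch, while the 1e-6 epsilon and the clamping follow the Detectron reference and are assumptions here.

// Illustrative only: box geometry with/without the legacy "+ 1", plus the
// FPN-paper level heuristic. Epsilon and clamping are assumed from the
// Detectron reference, not taken from this diff.
#include <algorithm>
#include <cmath>
#include <cstdio>

float BoxArea(float x1, float y1, float x2, float y2, bool legacy_plus_one) {
  const float w = x2 - x1 + (legacy_plus_one ? 1.0f : 0.0f);
  const float h = y2 - y1 + (legacy_plus_one ? 1.0f : 0.0f);
  return w * h;
}

int MapToFpnLevel(float area, float s0, int lvl0, int k_min, int k_max) {
  // FPN paper: k = floor(k0 + log2(sqrt(area) / s0)), clamped to [k_min, k_max].
  const int k = static_cast<int>(
      std::floor(lvl0 + std::log2(std::sqrt(area) / s0 + 1e-6f)));
  return std::min(std::max(k, k_min), k_max);
}

int main() {
  // A 224x224 RoI sits at the canonical level 4; a 112x112 RoI maps one level
  // down. The legacy "+ 1" only nudges the area (224*224 vs 223*223 here).
  const float a_legacy = BoxArea(0, 0, 223, 223, /*legacy_plus_one=*/true);
  const float a_plain = BoxArea(0, 0, 223, 223, /*legacy_plus_one=*/false);
  std::printf("area legacy=%.0f plain=%.0f\n", a_legacy, a_plain);
  std::printf("level(224x224)=%d\n", MapToFpnLevel(a_legacy, 224.0f, 4, 2, 5));
  std::printf("level(112x112)=%d\n",
              MapToFpnLevel(BoxArea(0, 0, 111, 111, true), 224.0f, 4, 2, 5));
  return 0;
}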
-ERArrXXf MapRoIsToFpnLevels(Eigen::Ref rois, - const float k_min, const float k_max, - const float s0, const float lvl0) { +ERArrXXf MapRoIsToFpnLevels( + Eigen::Ref rois, + const float k_min, + const float k_max, + const float s0, + const float lvl0, + const bool legacy_plus_one) { // Compute level ids - ERArrXXf s = BoxesArea(rois).sqrt(); + ERArrXXf s = BoxesArea(rois, legacy_plus_one).sqrt(); // s0 = cfg.FPN.ROI_CANONICAL_SCALE # default: 224 // lvl0 = cfg.FPN.ROI_CANONICAL_LEVEL # default: 4 @@ -39,8 +43,10 @@ ERArrXXf MapRoIsToFpnLevels(Eigen::Ref rois, // Sort RoIs from highest to lowest individual RoI score based on // values from scores array and limit to n results -void SortAndLimitRoIsByScores(Eigen::Ref scores, int n, - ERArrXXf& rois) { +void SortAndLimitRoIsByScores( + Eigen::Ref scores, + int n, + ERArrXXf& rois) { CAFFE_ENFORCE(rois.rows() == scores.size(), "RoIs and scores count mismatch"); // Create index array with 0, 1, ... N std::vector idxs(rois.rows()); @@ -48,8 +54,12 @@ void SortAndLimitRoIsByScores(Eigen::Ref scores, int n, // Reuse a comparator based on scores and store a copy of RoIs that // will be truncated and manipulated below auto comp = [&scores](int lhs, int rhs) { - if (scores(lhs) > scores(rhs)) return true; - if (scores(lhs) < scores(rhs)) return false; + if (scores(lhs) > scores(rhs)) { + return true; + } + if (scores(lhs) < scores(rhs)) { + return false; + } // To ensure the sort is stable return lhs < rhs; }; @@ -86,9 +96,12 @@ void ArgSort(EArrXi& arr) { // Update out_filtered and out_indices with rows from rois where lvl matches // value in lvls passed in. -void RowsWhereRoILevelEquals(Eigen::Ref rois, - const ERArrXXf& lvls, const int lvl, - ERArrXXf* out_filtered, EArrXi* out_indices) { +void RowsWhereRoILevelEquals( + Eigen::Ref rois, + const ERArrXXf& lvls, + const int lvl, + ERArrXXf* out_filtered, + EArrXi* out_indices) { CAFFE_ENFORCE(out_filtered != nullptr, "Output filtered required"); CAFFE_ENFORCE(out_indices != nullptr, "Output indices required"); CAFFE_ENFORCE(rois.rows() == lvls.rows(), "RoIs and lvls count mismatch"); @@ -142,6 +155,7 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { rois.block(len, 0, n, 5) = roi; const auto& score_in = Input(num_rpn_lvls + i); + CAFFE_ENFORCE_EQ(score_in.size(0), n); // No need to squeeze, since we are reshaping when converting to Eigen // https://docs.scipy.org/doc/numpy/reference/generated/numpy.squeeze.html @@ -167,9 +181,8 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { const int canon_scale = roi_canonical_scale_; const int canon_level = roi_canonical_level_; auto rois_block = rois.block(0, 1, rois.rows(), 4); - auto lvls = utils::MapRoIsToFpnLevels(rois_block, - lvl_min, lvl_max, - canon_scale, canon_level); + auto lvls = utils::MapRoIsToFpnLevels( + rois_block, lvl_min, lvl_max, canon_scale, canon_level, legacy_plus_one_); // equivalent to python code // outputs[0].reshape(rois.shape) @@ -193,7 +206,8 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { // outputs[output_idx + 1].data[...] 
= blob_roi_level // rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) // rois_idx_restore = np.argsort(rois_idx_order) - // blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), outputs[-1]) + // blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), + // outputs[-1]) EArrXi rois_idx_restore; for (int i = 0, lvl = lvl_min; i < num_roi_lvls; i++, lvl++) { ERArrXXf blob_roi_level; @@ -213,7 +227,144 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { roi_out_mat = blob_roi_level; // Append indices from idx_lvl to rois_idx_restore - rois_idx_restore.conservativeResize(rois_idx_restore.size() + idx_lvl.size()); + rois_idx_restore.conservativeResize( + rois_idx_restore.size() + idx_lvl.size()); + rois_idx_restore.tail(idx_lvl.size()) = idx_lvl; + } + utils::ArgSort(rois_idx_restore); + + auto* rois_idx_restore_out = + Output(OutputSize() - 1, {rois_idx_restore.size()}, at::dtype()); + Eigen::Map rois_idx_restore_out_mat( + rois_idx_restore_out->template mutable_data(), + rois_idx_restore.size()); + rois_idx_restore_out_mat = rois_idx_restore; + + return true; +} + +template <> +bool CollectRpnProposalsOp::RunOnDevice() { + int num_rpn_lvls = rpn_max_level_ - rpn_min_level_ + 1; + CAFFE_ENFORCE_EQ(InputSize(), 2 * num_rpn_lvls); + + // Collect rois and scores in Eigen + // rois are in [[batch_idx, x0, y0, x1, y2], ...] format + // Combine predictions across all levels and retain the top scoring + // + // equivalent to python code + // roi_inputs = inputs[:num_rpn_lvls] + // score_inputs = inputs[num_rpn_lvls:] + // rois = np.concatenate([blob.data for blob in roi_inputs]) + // scores = np.concatenate([blob.data for blob in score_inputs]).squeeze() + int proposal_num = 0; + for (int i = 0; i < num_rpn_lvls; i++) { + const auto& roi_in = Input(i); + proposal_num += roi_in.size(0); + } + ERArrXXf rois(proposal_num, 5); + EArrXf scores(proposal_num); + int len = 0; + for (int i = 0; i < num_rpn_lvls; i++) { + const auto& roi_in = Input(i); + const int n = roi_in.size(0); + + Eigen::Map roi(roi_in.data(), n, 5); + rois.block(len, 0, n, 5) = roi; + + const auto& score_in = Input(num_rpn_lvls + i); + CAFFE_ENFORCE_EQ(score_in.size(0), n); + + // No need to squeeze, since we are reshaping when converting to Eigen + // https://docs.scipy.org/doc/numpy/reference/generated/numpy.squeeze.html + Eigen::Map score(score_in.data(), n); + scores.segment(len, n) = score; + + len += n; + } + + // Grab only top rpn_post_nms_topN rois + // equivalent to python code + // inds = np.argsort(-scores)[:rpn_post_nms_topN] + // rois = rois[inds, :] + utils::SortAndLimitRoIsByScores(scores, rpn_post_nms_topN_, rois); + + // equivalent to python code + // outputs[0].reshape(rois.shape) + // outputs[0].data[...] 
= rois + + auto* rois_out = Output(0, {rois.rows(), rois.cols()}, at::dtype()); + Eigen::Map rois_out_mat( + rois_out->template mutable_data(), rois.rows(), rois.cols()); + rois_out_mat = rois; + + return true; +} + +template <> +bool DistributeFpnProposalsOp::RunOnDevice() { + int num_roi_lvls = roi_max_level_ - roi_min_level_ + 1; + CAFFE_ENFORCE_EQ(OutputSize(), num_roi_lvls + 1); + + // Load Input(0) to rois + const auto& rois_in = Input(0); + const int num_rois = rois_in.size(0); + const int dim_rois = rois_in.size(1); + CAFFE_ENFORCE(dim_rois == 4 || dim_rois == 5); + Eigen::Map rois_4or5( + rois_in.data(), num_rois, dim_rois); + ERArrXXf rois = ERArrXXf::Zero(num_rois, 5); + rois.rightCols(dim_rois) = rois_4or5; + + // Distribute + // equivalent to python code + // lvl_min = cfg.FPN.ROI_MIN_LEVEL + // lvl_max = cfg.FPN.ROI_MAX_LEVEL + // lvls = fpn.map_rois_to_fpn_levels(rois[:, 1:5], lvl_min, lvl_max) + const int lvl_min = roi_min_level_; + const int lvl_max = roi_max_level_; + const int canon_scale = roi_canonical_scale_; + const int canon_level = roi_canonical_level_; + auto rois_block = rois.block(0, 1, rois.rows(), 4); + auto lvls = utils::MapRoIsToFpnLevels( + rois_block, lvl_min, lvl_max, canon_scale, canon_level, legacy_plus_one_); + + // Create new roi blobs for each FPN level + // (See: modeling.FPN.add_multilevel_roi_blobs which is similar but annoying + // to generalize to support this particular case.) + // + // equivalent to python code + // rois_idx_order = np.empty((0, )) + // for (output_idx, lvl in enumerate(range(lvl_min, lvl_max + 1))) + // idx_lvl = np.where(lvls == lvl)[0] + // blob_roi_level = rois[idx_lvl, :] + // outputs[output_idx + 1].reshape(blob_roi_level.shape) + // outputs[output_idx + 1].data[...] = blob_roi_level + // rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) + // rois_idx_restore = np.argsort(rois_idx_order) + // blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), + // outputs[-1]) + EArrXi rois_idx_restore; + for (int i = 0, lvl = lvl_min; i < num_roi_lvls; i++, lvl++) { + ERArrXXf blob_roi_level; + EArrXi idx_lvl; + utils::RowsWhereRoILevelEquals(rois, lvls, lvl, &blob_roi_level, &idx_lvl); + + // Output blob_roi_level + + auto* roi_out = Output( + i + 0, + {blob_roi_level.rows(), blob_roi_level.cols()}, + at::dtype()); + Eigen::Map roi_out_mat( + roi_out->template mutable_data(), + blob_roi_level.rows(), + blob_roi_level.cols()); + roi_out_mat = blob_roi_level; + + // Append indices from idx_lvl to rois_idx_restore + rois_idx_restore.conservativeResize( + rois_idx_restore.size() + idx_lvl.size()); rois_idx_restore.tail(idx_lvl.size()) = idx_lvl; } utils::ArgSort(rois_idx_restore); @@ -230,7 +381,13 @@ bool CollectAndDistributeFpnRpnProposalsOp::RunOnDevice() { namespace { -REGISTER_CPU_OPERATOR(CollectAndDistributeFpnRpnProposals, CollectAndDistributeFpnRpnProposalsOp); +REGISTER_CPU_OPERATOR( + CollectAndDistributeFpnRpnProposals, + CollectAndDistributeFpnRpnProposalsOp); +REGISTER_CPU_OPERATOR(CollectRpnProposals, CollectRpnProposalsOp); +REGISTER_CPU_OPERATOR( + DistributeFpnProposals, + DistributeFpnProposalsOp); OPERATOR_SCHEMA(CollectAndDistributeFpnRpnProposals) .NumInputs(2, INT_MAX) @@ -344,5 +501,175 @@ will change. SHOULD_NOT_DO_GRADIENT(CollectAndDistributeFpnRpnProposals); +OPERATOR_SCHEMA(CollectRpnProposals) + .NumInputs(2, INT_MAX) + .NumOutputs(1) + .SetDoc(R"DOC( +... 
+)DOC") + .Arg("rpn_max_level", "(int) RPN_MAX_LEVEL") + .Arg("rpn_min_level", "(int) RPN_MIN_LEVEL") + .Arg("rpn_post_nms_topN", "(int) RPN_POST_NMS_TOP_N") + .Input( + 0, + "rpn_rois_fpn2", + "RPN proposals for FPN level 2, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 1, + "rpn_rois_fpn3", + "RPN proposals for FPN level 3, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 2, + "rpn_rois_fpn4", + "RPN proposals for FPN level 4, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 3, + "rpn_rois_fpn5", + "RPN proposals for FPN level 5, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 4, + "rpn_rois_fpn6", + "RPN proposals for FPN level 6, " + "format (image_index, x1, y1, x2, y2). See rpn_rois " + "documentation from GenerateProposals.") + .Input( + 5, + "rpn_roi_probs_fpn2", + "RPN objectness probabilities for FPN level 2. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 6, + "rpn_roi_probs_fpn3", + "RPN objectness probabilities for FPN level 3. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 7, + "rpn_roi_probs_fpn4", + "RPN objectness probabilities for FPN level 4. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 8, + "rpn_roi_probs_fpn5", + "RPN objectness probabilities for FPN level 5. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Input( + 9, + "rpn_roi_probs_fpn6", + "RPN objectness probabilities for FPN level 6. " + "See rpn_roi_probs documentation from GenerateProposals.") + .Output( + 0, + "rois", + "Top proposals limited to rpn_post_nms_topN total, " + "format (image_index, x1, y1, x2, y2)"); + +SHOULD_NOT_DO_GRADIENT(CollectRpnProposals); + +OPERATOR_SCHEMA(DistributeFpnProposals) + .NumInputs(1) + .NumOutputs(2, INT_MAX) + .SetDoc(R"DOC( +... 
+)DOC") + .Arg("roi_canonical_scale", "(int) ROI_CANONICAL_SCALE") + .Arg("roi_canonical_level", "(int) ROI_CANONICAL_LEVEL") + .Arg("roi_max_level", "(int) ROI_MAX_LEVEL") + .Arg("roi_min_level", "(int) ROI_MIN_LEVEL") + .Input( + 0, + "rois", + "Top proposals limited to rpn_post_nms_topN total, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 0, + "rois_fpn2", + "RPN proposals for ROI level 2, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 1, + "rois_fpn3", + "RPN proposals for ROI level 3, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 2, + "rois_fpn4", + "RPN proposals for ROI level 4, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 3, + "rois_fpn5", + "RPN proposals for ROI level 5, " + "format (image_index, x1, y1, x2, y2)") + .Output( + 4, + "rois_idx_restore", + "Permutation on the concatenation of all " + "rois_fpni, i=min...max, such that when applied the RPN RoIs are " + "restored to their original order in the input blobs."); + +SHOULD_NOT_DO_GRADIENT(DistributeFpnProposals); + } // namespace } // namespace caffe2 + +// clang-format off +C10_REGISTER_CAFFE2_OPERATOR_CPU( + CollectAndDistributeFpnRpnProposals, + "_caffe2::CollectAndDistributeFpnRpnProposals(" + "Tensor[] input_list, " + "int roi_canonical_scale, " + "int roi_canonical_level, " + "int roi_max_level, " + "int roi_min_level, " + "int rpn_max_level, " + "int rpn_min_level, " + "int rpn_post_nms_topN, " + "bool legacy_plus_one" + ") -> (" + "Tensor rois, " + "Tensor rois_fpn2, " + "Tensor rois_fpn3, " + "Tensor rois_fpn4, " + "Tensor rois_fpn5, " + "Tensor rois_idx_restore_int32" + ")", + caffe2::CollectAndDistributeFpnRpnProposalsOp); + +C10_REGISTER_CAFFE2_OPERATOR_CPU( + CollectRpnProposals, + "_caffe2::CollectRpnProposals(" + "Tensor[] input_list, " + "int rpn_max_level, " + "int rpn_min_level, " + "int rpn_post_nms_topN" + ") -> (" + "Tensor rois" + ")", + caffe2::CollectRpnProposalsOp); + +C10_REGISTER_CAFFE2_OPERATOR_CPU( + DistributeFpnProposals, + "_caffe2::DistributeFpnProposals(" + "Tensor rois, " + "int roi_canonical_scale, " + "int roi_canonical_level, " + "int roi_max_level, " + "int roi_min_level, " + "bool legacy_plus_one" + ") -> (" + "Tensor rois_fpn2, " + "Tensor rois_fpn3, " + "Tensor rois_fpn4, " + "Tensor rois_fpn5, " + "Tensor rois_idx_restore_int32" + ")", + caffe2::DistributeFpnProposalsOp); +// clang-format on diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index a73ef60aa41a..ed44b4cb130c 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -1,28 +1,39 @@ #ifndef CAFFE2_OPERATORS_COLLECT_AND_DISTRIBUTE_FPN_RPN_PROPOSALS_OP_H_ #define CAFFE2_OPERATORS_COLLECT_AND_DISTRIBUTE_FPN_RPN_PROPOSALS_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" +C10_DECLARE_CAFFE2_OPERATOR(CollectAndDistributeFpnRpnProposals); +C10_DECLARE_CAFFE2_OPERATOR(CollectRpnProposals); +C10_DECLARE_CAFFE2_OPERATOR(DistributeFpnProposals); + namespace caffe2 { namespace utils { // Compute the area of an array of boxes. -ERArrXXf BoxesArea(const ERArrXXf& boxes); +ERArrXXf BoxesArea(const ERArrXXf& boxes, const bool legacy_plus_one = false); // Determine which FPN level each RoI in a set of RoIs should map to based // on the heuristic in the FPN paper. 
-ERArrXXf MapRoIsToFpnLevels(Eigen::Ref rois, - const float k_min, const float k_max, - const float s0, const float lvl0); +ERArrXXf MapRoIsToFpnLevels( + Eigen::Ref rois, + const float k_min, + const float k_max, + const float s0, + const float lvl0, + const bool legacy_plus_one = false); // Sort RoIs from highest to lowest individual RoI score based on // values from scores array and limit to n results -void SortAndLimitRoIsByScores(Eigen::Ref scores, int n, - ERArrXXf& rois); +void SortAndLimitRoIsByScores( + Eigen::Ref scores, + int n, + ERArrXXf& rois); // Updates arr to be indices that would sort the array. Implementation of // https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html @@ -30,18 +41,22 @@ void ArgSort(EArrXi& arr); // Update out_filtered and out_indices with rows from rois where lvl matches // value in lvls passed in. -void RowsWhereRoILevelEquals(Eigen::Ref rois, - const ERArrXXf& lvls, const int lvl, - ERArrXXf* out_filtered, EArrXi* out_indices); +void RowsWhereRoILevelEquals( + Eigen::Ref rois, + const ERArrXXf& lvls, + const int lvl, + ERArrXXf* out_filtered, + EArrXi* out_indices); } // namespace utils // C++ implementation of CollectAndDistributeFpnRpnProposalsOp // Merge RPN proposals generated at multiple FPN levels and then -// distribute those proposals to their appropriate FPN levels for Faster RCNN. -// An anchor at one FPN level may predict an RoI that will map to another -// level, hence the need to redistribute the proposals. -// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py +// distribute those proposals to their appropriate FPN levels for Faster +// RCNN. An anchor at one FPN level may predict an RoI that will map to +// another level, hence the need to redistribute the proposals. +// Reference: +// facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: @@ -62,7 +77,9 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator { rpn_min_level_( this->template GetSingleArgument("rpn_min_level", 2)), rpn_post_nms_topN_( - this->template GetSingleArgument("rpn_post_nms_topN", 2000)) { + this->template GetSingleArgument("rpn_post_nms_topN", 2000)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { CAFFE_ENFORCE_GE( roi_max_level_, roi_min_level_, @@ -77,7 +94,7 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator { c10::to_string(rpn_min_level_) + "."); } - ~CollectAndDistributeFpnRpnProposalsOp() {} + ~CollectAndDistributeFpnRpnProposalsOp() override {} bool RunOnDevice() override; @@ -96,6 +113,84 @@ class CollectAndDistributeFpnRpnProposalsOp final : public Operator { int rpn_min_level_{2}; // RPN_POST_NMS_TOP_N int rpn_post_nms_topN_{2000}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; +}; + +template +class CollectRpnProposalsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + template + explicit CollectRpnProposalsOp(Args&&... 
args) + : Operator(std::forward(args)...), + rpn_max_level_( + this->template GetSingleArgument("rpn_max_level", 6)), + rpn_min_level_( + this->template GetSingleArgument("rpn_min_level", 2)), + rpn_post_nms_topN_( + this->template GetSingleArgument("rpn_post_nms_topN", 2000)) { + CAFFE_ENFORCE_GE( + rpn_max_level_, + rpn_min_level_, + "rpn_max_level " + c10::to_string(rpn_max_level_) + + " must be greater than or equal to rpn_min_level " + + c10::to_string(rpn_min_level_) + "."); + } + + ~CollectRpnProposalsOp() override {} + + bool RunOnDevice() override; + + protected: + // RPN_MAX_LEVEL + int rpn_max_level_{6}; + // RPN_MIN_LEVEL + int rpn_min_level_{2}; + // RPN_POST_NMS_TOP_N + int rpn_post_nms_topN_{2000}; +}; + +template +class DistributeFpnProposalsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + template + explicit DistributeFpnProposalsOp(Args&&... args) + : Operator(std::forward(args)...), + roi_canonical_scale_( + this->template GetSingleArgument("roi_canonical_scale", 224)), + roi_canonical_level_( + this->template GetSingleArgument("roi_canonical_level", 4)), + roi_max_level_( + this->template GetSingleArgument("roi_max_level", 5)), + roi_min_level_( + this->template GetSingleArgument("roi_min_level", 2)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) { + CAFFE_ENFORCE_GE( + roi_max_level_, + roi_min_level_, + "roi_max_level " + c10::to_string(roi_max_level_) + + " must be greater than or equal to roi_min_level " + + c10::to_string(roi_min_level_) + "."); + } + + ~DistributeFpnProposalsOp() override {} + + bool RunOnDevice() override; + + protected: + // ROI_CANONICAL_SCALE + int roi_canonical_scale_{224}; + // ROI_CANONICAL_LEVEL + int roi_canonical_level_{4}; + // ROI_MAX_LEVEL + int roi_max_level_{5}; + // ROI_MIN_LEVEL + int roi_min_level_{2}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; }; } // namespace caffe2 diff --git a/caffe2/operators/experimental/c10/cpu/add_cpu.cc b/caffe2/operators/experimental/c10/cpu/add_cpu.cc index 052cf1e6623b..a06e6e3781c9 100644 --- a/caffe2/operators/experimental/c10/cpu/add_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/add_cpu.cc @@ -71,18 +71,10 @@ void add_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Add", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("legacy_broadcast", BoolType::get()), - c10::Argument("axis", IntType::get())}), - (std::vector{})), - c10::kernel), &add_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Add", + c10::RegisterOperators::options() + .kernel), &add_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc index 069281cd6ff2..592c4e4c1b78 100644 --- a/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/averaged_loss_cpu.cc @@ -44,14 +44,10 @@ class averaged_loss_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::AveragedLoss", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::AveragedLoss", + c10::RegisterOperators::options() + 
.kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc index c293ababa8a7..ed1e58ecdb39 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_gather_cpu.cc @@ -65,15 +65,10 @@ void batch_gather_op_cpu(const at::Tensor& data, } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::BatchGather", - "", - (std::vector{c10::Argument("data"), - c10::Argument("indices"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::BatchGather", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc index bbd10cc4510e..e782a4c8beae 100644 --- a/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/batch_matmul_cpu.cc @@ -269,19 +269,10 @@ class batch_matmul_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::BatchMatmul", - "", - (std::vector{ - c10::Argument("A"), - c10::Argument("B"), - c10::Argument("output"), - c10::Argument("trans_a", IntType::get()), - c10::Argument("trans_b", IntType::get()), - c10::Argument("broadcast", IntType::get())}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::BatchMatmul", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc index 84db51298a96..8668429f03b2 100644 --- a/caffe2/operators/experimental/c10/cpu/cast_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/cast_cpu.cc @@ -87,17 +87,10 @@ void cast_op_cpu( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Cast", - "", - (std::vector{ - c10::Argument("input"), - c10::Argument("output"), - c10::Argument("to_dtype", IntType::get()), - }), - (std::vector{})), - c10::kernel(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Cast", + c10::RegisterOperators::options() + .kernel() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc index e84e8d07a5a3..999b002e5e80 100644 --- a/caffe2/operators/experimental/c10/cpu/concat_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/concat_cpu.cc @@ -13,7 +13,7 @@ namespace caffe2 { namespace { template void concat_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, const at::Tensor& split_, int64_t axis, @@ -105,20 +105,12 @@ void concat_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Concat", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("split_info"), - c10::Argument("add", IntType::get()), - c10::Argument("add_axis", IntType::get())}), - (std::vector{})), - c10::kernel< + "_c10_experimental::Concat", + c10::RegisterOperators::options() + .kernel< decltype(concat_op_cpu_impl), - &concat_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + 
&concat_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc b/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc index 69a4e75479fd..f73fb0284c54 100644 --- a/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/enforce_finite_cpu.cc @@ -25,15 +25,12 @@ void enforce_finite_op_impl_cpu(const at::Tensor& input_) { } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::EnforceFinite", - "", - (std::vector{c10::Argument("input")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::EnforceFinite", + c10::RegisterOperators::options() + .kernel< decltype(enforce_finite_op_impl_cpu), - &enforce_finite_op_impl_cpu>(), - c10::dispatchKey(CPUTensorId())); + &enforce_finite_op_impl_cpu>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc b/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc index 780ff2945a88..60cf805dcb8b 100644 --- a/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/expand_dims_cpu.cc @@ -14,12 +14,12 @@ class expand_dims_cpu final : public c10::OperatorKernel { void operator()( const at::Tensor& input_, const at::Tensor& output_, - ArrayRef dims) { + std::vector dims) { Tensor input(input_); Tensor output(output_); if (!initialized_) { - dims_ = dims.vec(); + dims_ = std::move(dims); auto originalSize = dims_.size(); CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); std::sort(dims_.begin(), dims_.end()); @@ -55,15 +55,10 @@ class expand_dims_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::ExpandDims", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output"), - c10::Argument("dims", ListType::ofInts())}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::ExpandDims", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/fc_cpu.cc b/caffe2/operators/experimental/c10/cpu/fc_cpu.cc index eac61b5a529a..3993da003053 100644 --- a/caffe2/operators/experimental/c10/cpu/fc_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/fc_cpu.cc @@ -129,18 +129,10 @@ class fc_op_cpu final : public c10::OperatorKernel { }; static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::FullyConnected", - "", - (std::vector{c10::Argument("X"), - c10::Argument("W"), - c10::Argument("b"), - c10::Argument("output"), - c10::Argument("axis", IntType::get()), - c10::Argument("axis_w", IntType::get())}), - (std::vector{})), - c10::kernel>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::FullyConnected", + c10::RegisterOperators::options() + .kernel>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc index 3e4966253858..40122cc3d803 100644 --- a/caffe2/operators/experimental/c10/cpu/filler_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/filler_cpu.cc @@ -45,10 +45,10 @@ void filler_init( template void given_tensor_fill_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, - ArrayRef shape, - ArrayRef extra_shape, + std::vector shape, + std::vector extra_shape, bool 
input_as_shape, const at::Tensor& values_) { Tensor output(output_); @@ -70,10 +70,10 @@ void given_tensor_fill_op_cpu_impl( } void constant_fill_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, - ArrayRef shape, - ArrayRef extra_shape, + std::vector shape, + std::vector extra_shape, bool input_as_shape, int64_t dtype, c10::Scalar value) { @@ -110,10 +110,10 @@ void constant_fill_op_cpu_impl( } void uniform_fill_op_cpu_impl( - ArrayRef inputs, + std::vector inputs, const at::Tensor& output_, - ArrayRef shape, - ArrayRef extra_shape, + std::vector shape, + std::vector extra_shape, bool input_as_shape, double min, double max) { @@ -145,86 +145,36 @@ void uniform_fill_op_cpu_impl( static auto registry = c10::RegisterOperators() - .op(FunctionSchema( - "_c10_experimental::ConstantFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("dtype", IntType::get()), - c10::Argument("value", NumberType::get())}), - (std::vector{})), - c10::kernel< + .op("_c10_experimental::ConstantFill", + c10::RegisterOperators::options() + .kernel< decltype(constant_fill_op_cpu_impl), - &constant_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::UniformFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("min", FloatType::get()), - c10::Argument("max", FloatType::get())}), - (std::vector{})), - c10::kernel< + &constant_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::UniformFill", + c10::RegisterOperators::options() + .kernel< decltype(uniform_fill_op_cpu_impl), - &uniform_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::GivenTensorFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{})), - c10::kernel< + &uniform_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::GivenTensorFill", + c10::RegisterOperators::options() + .kernel< decltype(given_tensor_fill_op_cpu_impl), - &given_tensor_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::GivenTensorIntFill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{})), - c10::kernel< + &given_tensor_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::GivenTensorIntFill", + c10::RegisterOperators::options() + .kernel< decltype(given_tensor_fill_op_cpu_impl), - &given_tensor_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())) - .op(FunctionSchema( - "_c10_experimental::GivenTensorInt64Fill", - "", - (std::vector{ - c10::Argument("inputs", ListType::ofTensors()), - c10::Argument("output"), - 
c10::Argument("shape", ListType::ofInts()), - c10::Argument("extra_shape", ListType::ofInts()), - c10::Argument("input_as_shape", BoolType::get()), - c10::Argument("values"), - }), - (std::vector{})), - c10::kernel< + &given_tensor_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())) + .op("_c10_experimental::GivenTensorInt64Fill", + c10::RegisterOperators::options() + .kernel< decltype(given_tensor_fill_op_cpu_impl), - &given_tensor_fill_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &given_tensor_fill_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc b/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc index 70cae21810a1..a2357f05a3a6 100644 --- a/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/flatten_cpu.cc @@ -27,17 +27,12 @@ void flatten_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Flatten", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output"), - c10::Argument("axis", IntType::get())}), - (std::vector{})), - c10::kernel< + "_c10_experimental::Flatten", + c10::RegisterOperators::options() + .kernel< decltype(flatten_op_cpu_impl), - &flatten_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &flatten_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/mul_cpu.cc b/caffe2/operators/experimental/c10/cpu/mul_cpu.cc index b64cbac56dff..eae2bf3a3764 100644 --- a/caffe2/operators/experimental/c10/cpu/mul_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/mul_cpu.cc @@ -72,18 +72,10 @@ void mul_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Mul", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("legacy_broadcast", BoolType::get()), - c10::Argument("axis", IntType::get())}), - (std::vector{})), - c10::kernel), &mul_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Mul", + c10::RegisterOperators::options() + .kernel), &mul_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/relu_cpu.cc b/caffe2/operators/experimental/c10/cpu/relu_cpu.cc index 0a0225cbbdba..d5eec0c87c73 100644 --- a/caffe2/operators/experimental/c10/cpu/relu_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/relu_cpu.cc @@ -41,14 +41,10 @@ void relu_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Relu", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel), &relu_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + "_c10_experimental::Relu", + c10::RegisterOperators::options() + .kernel), &relu_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc index cf60fd01a922..d78cb4f6192d 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cpu.cc @@ -24,16 +24,12 @@ void sigmoid_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::Sigmoid", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::Sigmoid", + 
c10::RegisterOperators::options() + .kernel< decltype(sigmoid_op_cpu_impl), - &sigmoid_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &sigmoid_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc index 460af4c2262a..f9bcabcf39f1 100644 --- a/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc @@ -71,20 +71,12 @@ void sigmoid_cross_entropy_with_logits_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::SigmoidCrossEntropyWithLogits", - "", - (std::vector{ - c10::Argument("input1"), - c10::Argument("input2"), - c10::Argument("output"), - c10::Argument("log_D_trick", BoolType::get()), - c10::Argument("unjoined_lr_loss", BoolType::get())}), - (std::vector{})), - c10::kernel< + "_c10_experimental::SigmoidCrossEntropyWithLogits", + c10::RegisterOperators::options() + .kernel< decltype(sigmoid_cross_entropy_with_logits_op_cpu_impl), - &sigmoid_cross_entropy_with_logits_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &sigmoid_cross_entropy_with_logits_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc index 98108e0c4fac..775a1b72968f 100644 --- a/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/sparse_lengths_sum_cpu.cc @@ -82,18 +82,12 @@ void sparse_lengths_sum_op_cpu( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::SparseLengthsSum", - "", - (std::vector{c10::Argument("data"), - c10::Argument("indices"), - c10::Argument("lengths"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::SparseLengthsSum", + c10::RegisterOperators::options() + .kernel< decltype(sparse_lengths_sum_op_cpu), - &sparse_lengths_sum_op_cpu>(), - c10::dispatchKey(CPUTensorId())); + &sparse_lengths_sum_op_cpu>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc b/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc index bbfc8910be0e..d43aa8ba6929 100644 --- a/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc +++ b/caffe2/operators/experimental/c10/cpu/stop_gradient_cpu.cc @@ -20,16 +20,12 @@ void stop_gradient_op_cpu_impl( } static auto registry = c10::RegisterOperators().op( - FunctionSchema( - "_c10_experimental::StopGradient", - "", - (std::vector{c10::Argument("input"), - c10::Argument("output")}), - (std::vector{})), - c10::kernel< + "_c10_experimental::StopGradient", + c10::RegisterOperators::options() + .kernel< decltype(stop_gradient_op_cpu_impl), - &stop_gradient_op_cpu_impl>(), - c10::dispatchKey(CPUTensorId())); + &stop_gradient_op_cpu_impl>() + .dispatchKey(CPUTensorId())); } // namespace diff --git a/caffe2/operators/generate_proposals_op.cc b/caffe2/operators/generate_proposals_op.cc index 7e6954fae146..7279ff17b761 100644 --- a/caffe2/operators/generate_proposals_op.cc +++ b/caffe2/operators/generate_proposals_op.cc @@ -228,17 +228,19 @@ void GenerateProposalsOp::ProposalsForOneImage( bbox_deltas_sorted, bbox_weights, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, 
angle_bound_on_, angle_bound_lo_, angle_bound_hi_); // 2. clip proposals to image (may result in proposals with zero area // that will be removed in the next step) - proposals = - utils::clip_boxes(proposals, im_info[0], im_info[1], clip_angle_thresh_); + proposals = utils::clip_boxes( + proposals, im_info[0], im_info[1], clip_angle_thresh_, legacy_plus_one_); // 3. remove predicted boxes with either height or width < min_size - auto keep = utils::filter_boxes(proposals, min_size, im_info); + auto keep = + utils::filter_boxes(proposals, min_size, im_info, legacy_plus_one_); DCHECK_LE(keep.size(), scores_sorted.size()); // 6. apply loose nms (e.g. threshold = 0.7) @@ -246,9 +248,15 @@ void GenerateProposalsOp::ProposalsForOneImage( // 8. return the top proposals (-> RoIs top) if (post_nms_topN > 0 && post_nms_topN < keep.size()) { keep = utils::nms_cpu( - proposals, scores_sorted, keep, nms_thresh, post_nms_topN); + proposals, + scores_sorted, + keep, + nms_thresh, + post_nms_topN, + legacy_plus_one_); } else { - keep = utils::nms_cpu(proposals, scores_sorted, keep, nms_thresh); + keep = utils::nms_cpu( + proposals, scores_sorted, keep, nms_thresh, -1, legacy_plus_one_); } // Generate outputs @@ -406,6 +414,7 @@ SHOULD_NOT_DO_GRADIENT(GenerateProposalsCPP); } // namespace caffe2 +// clang-format off C10_REGISTER_CAFFE2_OPERATOR_CPU( GenerateProposals, "_caffe2::GenerateProposals(" @@ -421,6 +430,8 @@ C10_REGISTER_CAFFE2_OPERATOR_CPU( "bool angle_bound_on, " "int angle_bound_lo, " "int angle_bound_hi, " - "float clip_angle_thresh" + "float clip_angle_thresh, " + "bool legacy_plus_one" ") -> (Tensor output_0, Tensor output_1)", caffe2::GenerateProposalsOp); +// clang-format on diff --git a/caffe2/operators/generate_proposals_op.cu b/caffe2/operators/generate_proposals_op.cu index 2bb1d9b2768b..fcae9d6fe822 100644 --- a/caffe2/operators/generate_proposals_op.cu +++ b/caffe2/operators/generate_proposals_op.cu @@ -23,6 +23,7 @@ __global__ void GeneratePreNMSUprightBoxesKernel( const float* d_img_info_vec, const int num_images, const float bbox_xform_clip, + const bool legacy_plus_one, float4* d_out_boxes, const int prenms_nboxes, // leading dimension of out_boxes float* d_inout_scores, @@ -81,35 +82,35 @@ __global__ void GeneratePreNMSUprightBoxesKernel( dh = fmin(dh, bbox_xform_clip); // Applying the deltas - float width = x2 - x1 + 1.0f; + float width = x2 - x1 + float(int(legacy_plus_one)); const float ctr_x = x1 + 0.5f * width; const float pred_ctr_x = ctr_x + width * dx; // TODO fuse madd const float pred_w = width * expf(dw); x1 = pred_ctr_x - 0.5f * pred_w; - x2 = pred_ctr_x + 0.5f * pred_w - 1.0f; + x2 = pred_ctr_x + 0.5f * pred_w - float(int(legacy_plus_one)); - float height = y2 - y1 + 1.0f; + float height = y2 - y1 + float(int(legacy_plus_one)); const float ctr_y = y1 + 0.5f * height; const float pred_ctr_y = ctr_y + height * dy; const float pred_h = height * expf(dh); y1 = pred_ctr_y - 0.5f * pred_h; - y2 = pred_ctr_y + 0.5f * pred_h - 1.0f; + y2 = pred_ctr_y + 0.5f * pred_h - float(int(legacy_plus_one)); // Clipping box to image const float img_height = d_img_info_vec[3 * image_index + 0]; const float img_width = d_img_info_vec[3 * image_index + 1]; const float min_size_scaled = min_size * d_img_info_vec[3 * image_index + 2]; - x1 = fmax(fmin(x1, img_width - 1.0f), 0.0f); - y1 = fmax(fmin(y1, img_height - 1.0f), 0.0f); - x2 = fmax(fmin(x2, img_width - 1.0f), 0.0f); - y2 = fmax(fmin(y2, img_height - 1.0f), 0.0f); + x1 = fmax(fmin(x1, img_width - float(int(legacy_plus_one))), 0.0f); + 
y1 = fmax(fmin(y1, img_height - float(int(legacy_plus_one))), 0.0f); + x2 = fmax(fmin(x2, img_width - float(int(legacy_plus_one))), 0.0f); + y2 = fmax(fmin(y2, img_height - float(int(legacy_plus_one))), 0.0f); // Filter boxes // Removing boxes with one dim < min_size // (center of box is in image, because of previous step) - width = x2 - x1 + 1.0f; // may have changed - height = y2 - y1 + 1.0f; + width = x2 - x1 + float(int(legacy_plus_one)); // may have changed + height = y2 - y1 + float(int(legacy_plus_one)); bool keep_box = fmin(width, height) >= min_size_scaled; // We are not deleting the box right now even if !keep_box @@ -140,6 +141,7 @@ __global__ void GeneratePreNMSRotatedBoxesKernel( const float* d_img_info_vec, const int num_images, const float bbox_xform_clip, + const bool legacy_plus_one, const bool angle_bound_on, const int angle_bound_lo, const int angle_bound_hi, @@ -229,22 +231,22 @@ __global__ void GeneratePreNMSRotatedBoxesKernel( min_size * d_img_info_vec[3 * image_index + 2]; if (fabs(box.a) <= clip_angle_thresh) { // Convert from [x_ctr, y_ctr, w, h] to [x1, y1, x2, y2] - float x1 = box.x_ctr - (box.w - 1.f) / 2.f; - float y1 = box.y_ctr - (box.h - 1.f) / 2.f; - float x2 = x1 + box.w - 1.f; - float y2 = y1 + box.h - 1.f; + float x1 = box.x_ctr - (box.w - float(int(legacy_plus_one))) / 2.f; + float y1 = box.y_ctr - (box.h - float(int(legacy_plus_one))) / 2.f; + float x2 = x1 + box.w - float(int(legacy_plus_one)); + float y2 = y1 + box.h - float(int(legacy_plus_one)); // Clip - x1 = fmax(fmin(x1, img_width - 1.0f), 0.0f); - y1 = fmax(fmin(y1, img_height - 1.0f), 0.0f); - x2 = fmax(fmin(x2, img_width - 1.0f), 0.0f); - y2 = fmax(fmin(y2, img_height - 1.0f), 0.0f); + x1 = fmax(fmin(x1, img_width - float(int(legacy_plus_one))), 0.0f); + y1 = fmax(fmin(y1, img_height - float(int(legacy_plus_one))), 0.0f); + x2 = fmax(fmin(x2, img_width - float(int(legacy_plus_one))), 0.0f); + y2 = fmax(fmin(y2, img_height - float(int(legacy_plus_one))), 0.0f); // Convert back to [x_ctr, y_ctr, w, h] box.x_ctr = (x1 + x2) / 2.f; box.y_ctr = (y1 + y2) / 2.f; - box.w = x2 - x1 + 1.f; - box.h = y2 - y1 + 1.f; + box.w = x2 - x1 + float(int(legacy_plus_one)); + box.h = y2 - y1 + float(int(legacy_plus_one)); } // Filter boxes. 
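// ----------------------------------------------------------------------------
// Sketch (editor's illustration, not part of the patch): every hard-coded
// "+ 1.0f" in the box decode/clip code above becomes
// "+ float(int(legacy_plus_one))", so the old inclusive-pixel convention is
// kept only when the flag is true. A scalar illustration (helper name invented
// for this note):

inline float box_width_sketch(float x1, float x2, bool legacy_plus_one) {
  return x2 - x1 + float(int(legacy_plus_one));
}

// For x1 = 10.f, x2 = 19.f:
//   box_width_sketch(10.f, 19.f, true)  -> 10.f  (legacy "+ 1" convention)
//   box_width_sketch(10.f, 19.f, false) ->  9.f  (plain coordinate difference)
// ----------------------------------------------------------------------------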
@@ -485,6 +487,7 @@ bool GenerateProposalsOp::RunOnDevice() { d_im_info_vec, num_images, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, reinterpret_cast(d_boxes), nboxes_to_generate, d_sorted_scores, @@ -507,6 +510,7 @@ bool GenerateProposalsOp::RunOnDevice() { d_im_info_vec, num_images, utils::BBOX_XFORM_CLIP_DEFAULT, + legacy_plus_one_, angle_bound_on_, angle_bound_lo_, angle_bound_hi_, @@ -597,6 +601,7 @@ bool GenerateProposalsOp::RunOnDevice() { d_image_prenms_boxes, prenms_nboxes, rpn_nms_thresh_, + legacy_plus_one_, d_image_boxes_keep_list, &nkeep, dev_nms_mask_, @@ -669,5 +674,4 @@ REGISTER_CUDA_OPERATOR(GenerateProposals, GenerateProposalsOp); C10_REGISTER_CAFFE2_OPERATOR_CUDA( GenerateProposals, - caffe2::GenerateProposalsOp -); + caffe2::GenerateProposalsOp); diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index f2c4a8f6ec1f..e55972c323fc 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -1,9 +1,9 @@ #ifndef CAFFE2_OPERATORS_GENERATE_PROPOSALS_OP_H_ #define CAFFE2_OPERATORS_GENERATE_PROPOSALS_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/operator.h" -#include "caffe2/core/c10_operator.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" @@ -79,7 +79,7 @@ template class GenerateProposalsOp final : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - template + template explicit GenerateProposalsOp(Args&&... args) : Operator(std::forward(args)...), spatial_scale_( @@ -99,7 +99,9 @@ class GenerateProposalsOp final : public Operator { angle_bound_hi_( this->template GetSingleArgument("angle_bound_hi", 90)), clip_angle_thresh_( - this->template GetSingleArgument("clip_angle_thresh", 1.0)) {} + this->template GetSingleArgument("clip_angle_thresh", 1.0)), + legacy_plus_one_( + this->template GetSingleArgument("legacy_plus_one", true)) {} ~GenerateProposalsOp() {} @@ -142,6 +144,8 @@ class GenerateProposalsOp final : public Operator { // tolerance for backward compatibility. Set to negative value for // no clipping. 
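// Note (editor's sketch, not part of the patch): the constructor hunk above
// gives the new argument a default of true, so nets that do not set it keep
// the old "+ 1" behavior. Written out with the template parameters implied by
// the float/bool members declared in this class (reconstructed as an
// assumption, not quoted from the file):
//
//   clip_angle_thresh_(
//       this->template GetSingleArgument<float>("clip_angle_thresh", 1.0)),
//   legacy_plus_one_(
//       this->template GetSingleArgument<bool>("legacy_plus_one", true)) {}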
float clip_angle_thresh_{1.0}; + // The infamous "+ 1" for box width and height dating back to the DPM days + bool legacy_plus_one_{true}; // Scratch space required by the CUDA version // CUB buffers diff --git a/caffe2/operators/generate_proposals_op_gpu_test.cc b/caffe2/operators/generate_proposals_op_gpu_test.cc index da3f56a284ec..d328f81726c0 100644 --- a/caffe2/operators/generate_proposals_op_gpu_test.cc +++ b/caffe2/operators/generate_proposals_op_gpu_test.cc @@ -362,7 +362,8 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0GPU) { rois_gt_xyxy.block(0, 0, rois_gt.rows(), 0); // rois_gt in [x_ctr, y_ctr, w, h] format rois_gt.block(0, 1, rois_gt.rows(), 4) = utils::bbox_xyxy_to_ctrwh( - rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array()); + rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array(), + true /* legacy_plus_one */); // Angle rois_gt.block(0, 5, rois_gt.rows(), 1) = ERMatXf::Constant(rois_gt.rows(), 1, angle); diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index f79cf6891241..def7f286e85c 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -147,7 +147,8 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) { // Convert to RRPN format and add angles ERMatXf anchors(3, 5); - anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh(anchors_xyxy.array()); + anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh( + anchors_xyxy.array(), true /* legacy_plus_one */); std::vector angles{0.0, 45.0, -120.0}; for (int i = 0; i < anchors.rows(); ++i) { anchors(i, 4) = angles[i % angles.size()]; @@ -170,8 +171,8 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) { // Convert gt to RRPN format and add angles ERMatXf all_anchors_gt(36, 5); - all_anchors_gt.block(0, 0, 36, 4) = - utils::bbox_xyxy_to_ctrwh(all_anchors_gt_xyxy.array()); + all_anchors_gt.block(0, 0, 36, 4) = utils::bbox_xyxy_to_ctrwh( + all_anchors_gt_xyxy.array(), true /* legacy_plus_one */); for (int i = 0; i < all_anchors_gt.rows(); ++i) { all_anchors_gt(i, 4) = angles[i % angles.size()]; } @@ -196,7 +197,8 @@ TEST(GenerateProposalsTest, TestComputeSortedAnchorsRotated) { // Convert to RRPN format and add angles ERMatXf anchors(3, 5); - anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh(anchors_xyxy.array()); + anchors.block(0, 0, 3, 4) = utils::bbox_xyxy_to_ctrwh( + anchors_xyxy.array(), true /* legacy_plus_one */); std::vector angles{0.0, 45.0, -120.0}; for (int i = 0; i < anchors.rows(); ++i) { anchors(i, 4) = angles[i % angles.size()]; @@ -524,7 +526,8 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { rois_gt_xyxy.block(0, 0, rois_gt.rows(), 1); // rois_gt in [x_ctr, y_ctr, w, h] format rois_gt.block(0, 1, rois_gt.rows(), 4) = utils::bbox_xyxy_to_ctrwh( - rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array()); + rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4).array(), + true /* legacy_plus_one */); // Angle rois_gt.block(0, 5, rois_gt.rows(), 1) = ERMatXf::Constant(rois_gt.rows(), 1, angle); diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 04dd1912f6a7..92f9714c5a10 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -39,7 +39,8 @@ EArrXXt bbox_transform_upright( const Eigen::ArrayBase& deltas, const std::vector& weights = std::vector{1.0, 1.0, 1.0, 1.0}, - const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT) 
{ + const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT, + const bool legacy_plus_one = false) { using T = typename Derived1::Scalar; using EArrXX = EArrXXt; using EArrX = EArrXt; @@ -52,8 +53,8 @@ EArrXXt bbox_transform_upright( CAFFE_ENFORCE_EQ(boxes.cols(), 4); CAFFE_ENFORCE_EQ(deltas.cols(), 4); - EArrX widths = boxes.col(2) - boxes.col(0) + T(1.0); - EArrX heights = boxes.col(3) - boxes.col(1) + T(1.0); + EArrX widths = boxes.col(2) - boxes.col(0) + T(int(legacy_plus_one)); + EArrX heights = boxes.col(3) - boxes.col(1) + T(int(legacy_plus_one)); auto ctr_x = boxes.col(0) + T(0.5) * widths; auto ctr_y = boxes.col(1) + T(0.5) * heights; @@ -75,9 +76,9 @@ EArrXXt bbox_transform_upright( // y1 pred_boxes.col(1) = pred_ctr_y - T(0.5) * pred_h; // x2 - pred_boxes.col(2) = pred_ctr_x + T(0.5) * pred_w - T(1.0); + pred_boxes.col(2) = pred_ctr_x + T(0.5) * pred_w - T(int(legacy_plus_one)); // y2 - pred_boxes.col(3) = pred_ctr_y + T(0.5) * pred_h - T(1.0); + pred_boxes.col(3) = pred_ctr_y + T(0.5) * pred_h - T(int(legacy_plus_one)); return pred_boxes; } @@ -166,13 +167,15 @@ EArrXXt bbox_transform( const std::vector& weights = std::vector{1.0, 1.0, 1.0, 1.0}, const float bbox_xform_clip = BBOX_XFORM_CLIP_DEFAULT, + const bool legacy_plus_one = false, const bool angle_bound_on = true, const int angle_bound_lo = -90, const int angle_bound_hi = 90) { CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5); if (boxes.cols() == 4) { // Upright boxes - return bbox_transform_upright(boxes, deltas, weights, bbox_xform_clip); + return bbox_transform_upright( + boxes, deltas, weights, bbox_xform_clip, legacy_plus_one); } else { // Rotated boxes with angle info return bbox_transform_rotated( @@ -188,7 +191,8 @@ EArrXXt bbox_transform( template EArrXXt bbox_xyxy_to_ctrwh( - const Eigen::ArrayBase& boxes) { + const Eigen::ArrayBase& boxes, + bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(boxes.cols(), 4); const auto& x1 = boxes.col(0); @@ -199,14 +203,15 @@ EArrXXt bbox_xyxy_to_ctrwh( EArrXXt ret(boxes.rows(), 4); ret.col(0) = (x1 + x2) / 2.0; // x_ctr ret.col(1) = (y1 + y2) / 2.0; // y_ctr - ret.col(2) = x2 - x1 + 1.0; // w - ret.col(3) = y2 - y1 + 1.0; // h + ret.col(2) = x2 - x1 + int(legacy_plus_one); // w + ret.col(3) = y2 - y1 + int(legacy_plus_one); // h return ret; } template EArrXXt bbox_ctrwh_to_xyxy( - const Eigen::ArrayBase& boxes) { + const Eigen::ArrayBase& boxes, + const bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(boxes.cols(), 4); const auto& x_ctr = boxes.col(0); @@ -215,10 +220,10 @@ EArrXXt bbox_ctrwh_to_xyxy( const auto& h = boxes.col(3); EArrXXt ret(boxes.rows(), 4); - ret.col(0) = x_ctr - (w - 1) / 2.0; // x1 - ret.col(1) = y_ctr - (h - 1) / 2.0; // y1 - ret.col(2) = x_ctr + (w - 1) / 2.0; // x2 - ret.col(3) = y_ctr + (h - 1) / 2.0; // y2 + ret.col(0) = x_ctr - (w - int(legacy_plus_one)) / 2.0; // x1 + ret.col(1) = y_ctr - (h - int(legacy_plus_one)) / 2.0; // y1 + ret.col(2) = x_ctr + (w - int(legacy_plus_one)) / 2.0; // x2 + ret.col(3) = y_ctr + (h - int(legacy_plus_one)) / 2.0; // y2 return ret; } @@ -228,19 +233,20 @@ template EArrXXt clip_boxes_upright( const Eigen::ArrayBase& boxes, int height, - int width) { + int width, + bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 4); EArrXXt ret(boxes.rows(), boxes.cols()); // x1 >= 0 && x1 < width - ret.col(0) = boxes.col(0).cwiseMin(width - 1).cwiseMax(0); + ret.col(0) = boxes.col(0).cwiseMin(width - int(legacy_plus_one)).cwiseMax(0); // y1 >= 0 && y1 < height - ret.col(1) = boxes.col(1).cwiseMin(height - 
1).cwiseMax(0); + ret.col(1) = boxes.col(1).cwiseMin(height - int(legacy_plus_one)).cwiseMax(0); // x2 >= 0 && x2 < width - ret.col(2) = boxes.col(2).cwiseMin(width - 1).cwiseMax(0); + ret.col(2) = boxes.col(2).cwiseMin(width - int(legacy_plus_one)).cwiseMax(0); // y2 >= 0 && y2 < height - ret.col(3) = boxes.col(3).cwiseMin(height - 1).cwiseMax(0); + ret.col(3) = boxes.col(3).cwiseMin(height - int(legacy_plus_one)).cwiseMax(0); return ret; } @@ -263,7 +269,8 @@ EArrXXt clip_boxes_rotated( const Eigen::ArrayBase& boxes, int height, int width, - float angle_thresh = 1.0) { + float angle_thresh = 1.0, + bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 5); const auto& angles = boxes.col(4); @@ -275,13 +282,13 @@ EArrXXt clip_boxes_rotated( // Convert to [x1, y1, x2, y2] format and clip them const auto& upright_boxes_xyxy = - bbox_ctrwh_to_xyxy(upright_boxes.leftCols(4)); + bbox_ctrwh_to_xyxy(upright_boxes.leftCols(4), legacy_plus_one); const auto& clipped_upright_boxes_xyxy = - clip_boxes_upright(upright_boxes_xyxy, height, width); + clip_boxes_upright(upright_boxes_xyxy, height, width, legacy_plus_one); // Convert back to [x_ctr, y_ctr, w, h, angle] and update upright boxes upright_boxes.block(0, 0, upright_boxes.rows(), 4) = - bbox_xyxy_to_ctrwh(clipped_upright_boxes_xyxy); + bbox_xyxy_to_ctrwh(clipped_upright_boxes_xyxy, legacy_plus_one); EArrXXt ret(boxes.rows(), boxes.cols()); ret = boxes; @@ -297,14 +304,16 @@ EArrXXt clip_boxes( const Eigen::ArrayBase& boxes, int height, int width, - float angle_thresh = 1.0) { + float angle_thresh = 1.0, + bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5); if (boxes.cols() == 4) { // Upright boxes - return clip_boxes_upright(boxes, height, width); + return clip_boxes_upright(boxes, height, width, legacy_plus_one); } else { // Rotated boxes with angle info - return clip_boxes_rotated(boxes, height, width, angle_thresh); + return clip_boxes_rotated( + boxes, height, width, angle_thresh, legacy_plus_one); } } @@ -316,7 +325,8 @@ template std::vector filter_boxes_upright( const Eigen::ArrayBase& boxes, double min_size, - const Eigen::Array3f& im_info) { + const Eigen::Array3f& im_info, + const bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(boxes.cols(), 4); // Scale min_size to match image scale @@ -325,8 +335,8 @@ std::vector filter_boxes_upright( using T = typename Derived::Scalar; using EArrX = EArrXt; - EArrX ws = boxes.col(2) - boxes.col(0) + T(1); - EArrX hs = boxes.col(3) - boxes.col(1) + T(1); + EArrX ws = boxes.col(2) - boxes.col(0) + T(int(legacy_plus_one)); + EArrX hs = boxes.col(3) - boxes.col(1) + T(int(legacy_plus_one)); EArrX x_ctr = boxes.col(0) + ws / T(2); EArrX y_ctr = boxes.col(1) + hs / T(2); @@ -368,11 +378,12 @@ template std::vector filter_boxes( const Eigen::ArrayBase& boxes, double min_size, - const Eigen::Array3f& im_info) { + const Eigen::Array3f& im_info, + const bool legacy_plus_one = false) { CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5); if (boxes.cols() == 4) { // Upright boxes - return filter_boxes_upright(boxes, min_size, im_info); + return filter_boxes_upright(boxes, min_size, im_info, legacy_plus_one); } else { // Rotated boxes with angle info return filter_boxes_rotated(boxes, min_size, im_info); diff --git a/caffe2/operators/generate_proposals_op_util_boxes_test.cc b/caffe2/operators/generate_proposals_op_util_boxes_test.cc index 1a08de7f422c..c36371c1ca14 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes_test.cc +++ 
b/caffe2/operators/generate_proposals_op_util_boxes_test.cc @@ -31,7 +31,8 @@ TEST(UtilsBoxesTest, TestBboxTransformRandom) { bbox.array(), deltas.array(), std::vector{1.0, 1.0, 1.0, 1.0}, - BBOX_XFORM_CLIP); + BBOX_XFORM_CLIP, + true /* legacy_plus_one */); EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-4); } @@ -64,6 +65,7 @@ TEST(UtilsBoxesTest, TestBboxTransformRotated) { deltas.array(), std::vector{1.0, 1.0, 1.0, 1.0}, BBOX_XFORM_CLIP, + true, /* legacy_plus_one */ false /* angle_bound_on */); EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-2); } @@ -96,6 +98,7 @@ TEST(UtilsBoxesTest, TestBboxTransformRotatedNormalized) { deltas.array(), std::vector{1.0, 1.0, 1.0, 1.0}, BBOX_XFORM_CLIP, + true, /* legacy_plus_one */ true, /* angle_bound_on */ -90, /* angle_bound_lo */ 90 /* angle_bound_hi */); @@ -117,7 +120,8 @@ TEST(UtilsBoxesTest, ClipRotatedBoxes) { // Test with no clipping float angle_thresh = -1.0; - auto result = utils::clip_boxes(bbox.array(), height, width, angle_thresh); + auto result = utils::clip_boxes( + bbox.array(), height, width, angle_thresh, true /* legacy_plus_one */); EXPECT_NEAR((result.matrix() - bbox).norm(), 0.0, 1e-4); EMatXf result_gt(5, 5); @@ -127,7 +131,8 @@ TEST(UtilsBoxesTest, ClipRotatedBoxes) { // Test clipping with tolerance angle_thresh = 1.0; - result = utils::clip_boxes(bbox.array(), height, width, angle_thresh); + result = utils::clip_boxes( + bbox.array(), height, width, angle_thresh, true /* legacy_plus_one */); EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-4); } diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 8c5234e3474c..571d7d59bcb8 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -27,7 +27,8 @@ std::vector nms_cpu_upright( const Eigen::ArrayBase& scores, const std::vector& sorted_indices, float thresh, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(proposals.rows(), scores.rows()); CAFFE_ENFORCE_EQ(proposals.cols(), 4); CAFFE_ENFORCE_EQ(scores.cols(), 1); @@ -40,7 +41,8 @@ std::vector nms_cpu_upright( auto x2 = proposals.col(2); auto y2 = proposals.col(3); - EArrX areas = (x2 - x1 + 1.0) * (y2 - y1 + 1.0); + EArrX areas = + (x2 - x1 + int(legacy_plus_one)) * (y2 - y1 + int(legacy_plus_one)); EArrXi order = AsEArrXt(sorted_indices); std::vector keep; @@ -59,8 +61,8 @@ std::vector nms_cpu_upright( EArrX xx2 = GetSubArray(x2, rest_indices).cwiseMin(x2[i]); EArrX yy2 = GetSubArray(y2, rest_indices).cwiseMin(y2[i]); - EArrX w = (xx2 - xx1 + 1.0).cwiseMax(0.0); - EArrX h = (yy2 - yy1 + 1.0).cwiseMax(0.0); + EArrX w = (xx2 - xx1 + int(legacy_plus_one)).cwiseMax(0.0); + EArrX h = (yy2 - yy1 + int(legacy_plus_one)).cwiseMax(0.0); EArrX inter = w * h; EArrX ovr = inter / (areas[i] + GetSubArray(areas, rest_indices) - inter); @@ -98,7 +100,8 @@ std::vector soft_nms_cpu_upright( float overlap_thresh = 0.3, float score_thresh = 0.001, unsigned int method = 1, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE_EQ(proposals.rows(), scores.rows()); CAFFE_ENFORCE_EQ(proposals.cols(), 4); CAFFE_ENFORCE_EQ(scores.cols(), 1); @@ -110,7 +113,8 @@ std::vector soft_nms_cpu_upright( const auto& x2 = proposals.col(2); const auto& y2 = proposals.col(3); - EArrX areas = (x2 - x1 + 1.0) * (y2 - y1 + 1.0); + EArrX areas = + (x2 - x1 + int(legacy_plus_one)) * (y2 - y1 + int(legacy_plus_one)); // Initialize out_scores with 
original scores. Will be iteratively updated // as Soft-NMS is applied. @@ -138,8 +142,8 @@ std::vector soft_nms_cpu_upright( EArrX xx2 = GetSubArray(x2, rest_indices).cwiseMin(x2[i]); EArrX yy2 = GetSubArray(y2, rest_indices).cwiseMin(y2[i]); - EArrX w = (xx2 - xx1 + 1.0).cwiseMax(0.0); - EArrX h = (yy2 - yy1 + 1.0).cwiseMax(0.0); + EArrX w = (xx2 - xx1 + int(legacy_plus_one)).cwiseMax(0.0); + EArrX h = (yy2 - yy1 + int(legacy_plus_one)).cwiseMax(0.0); EArrX inter = w * h; EArrX ovr = inter / (areas[i] + GetSubArray(areas, rest_indices) - inter); @@ -656,11 +660,13 @@ std::vector nms_cpu( const Eigen::ArrayBase& scores, const std::vector& sorted_indices, float thresh, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE(proposals.cols() == 4 || proposals.cols() == 5); if (proposals.cols() == 4) { // Upright boxes - return nms_cpu_upright(proposals, scores, sorted_indices, thresh, topN); + return nms_cpu_upright( + proposals, scores, sorted_indices, thresh, topN, legacy_plus_one); } else { // Rotated boxes with angle info return nms_cpu_rotated(proposals, scores, sorted_indices, thresh, topN); @@ -681,7 +687,8 @@ template std::vector nms_cpu( const Eigen::ArrayBase& proposals, const Eigen::ArrayBase& scores, - float thres) { + float thres, + bool legacy_plus_one = false) { std::vector indices(proposals.rows()); std::iota(indices.begin(), indices.end(), 0); std::sort( @@ -689,7 +696,13 @@ std::vector nms_cpu( indices.data() + indices.size(), [&scores](int lhs, int rhs) { return scores(lhs) > scores(rhs); }); - return nms_cpu(proposals, scores, indices, thres); + return nms_cpu( + proposals, + scores, + indices, + thres, + -1 /* topN */, + legacy_plus_one /* legacy_plus_one */); } template @@ -702,7 +715,8 @@ std::vector soft_nms_cpu( float overlap_thresh = 0.3, float score_thresh = 0.001, unsigned int method = 1, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { CAFFE_ENFORCE(proposals.cols() == 4 || proposals.cols() == 5); if (proposals.cols() == 4) { // Upright boxes @@ -715,7 +729,8 @@ std::vector soft_nms_cpu( overlap_thresh, score_thresh, method, - topN); + topN, + legacy_plus_one); } else { // Rotated boxes with angle info return soft_nms_cpu_rotated( @@ -740,7 +755,8 @@ std::vector soft_nms_cpu( float overlap_thresh = 0.3, float score_thresh = 0.001, unsigned int method = 1, - int topN = -1) { + int topN = -1, + bool legacy_plus_one = false) { std::vector indices(proposals.rows()); std::iota(indices.begin(), indices.end(), 0); return soft_nms_cpu( @@ -752,7 +768,8 @@ std::vector soft_nms_cpu( overlap_thresh, score_thresh, method, - topN); + topN, + legacy_plus_one); } } // namespace utils diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu index 0cf157c99022..60cd996e45e3 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu @@ -29,6 +29,7 @@ __launch_bounds__( const Box* d_desc_sorted_boxes, const int nboxes, const float thresh, + const bool legacy_plus_one, const int mask_ld, int* d_delete_mask) { // Storing boxes used by this CUDA block in the shared memory @@ -45,7 +46,8 @@ __launch_bounds__( if (threadIdx.y == 0) { const Box box = d_desc_sorted_boxes[i_to_load]; shared_i_areas[threadIdx.x] = - (box.x2 - box.x1 + 1.0f) * (box.y2 - box.y1 + 1.0f); + (box.x2 - box.x1 + float(int(legacy_plus_one))) * + (box.y2 - box.y1 + float(int(legacy_plus_one))); shared_i_boxes[threadIdx.x] = box; } 
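// ----------------------------------------------------------------------------
// Sketch (editor's illustration, not part of the patch): the CPU
// nms_cpu_upright / soft_nms_cpu_upright paths and this GPU kernel now share
// the same legacy_plus_one-aware IoU. A standalone helper showing that formula
// (Box4 and iou_upright_sketch are invented names; fminf/fmaxf/fdimf come from
// <cmath> on the host and are CUDA math builtins on the device):

struct Box4 {
  float x1, y1, x2, y2;
};

inline float iou_upright_sketch(const Box4& a, const Box4& b, bool legacy_plus_one) {
  const float lp = float(int(legacy_plus_one));
  const float area_a = (a.x2 - a.x1 + lp) * (a.y2 - a.y1 + lp);
  const float area_b = (b.x2 - b.x1 + lp) * (b.y2 - b.y1 + lp);
  // Intersection; fdimf(x, y) == max(x - y, 0), as in the kernel above.
  const float w = fdimf(fminf(a.x2, b.x2) + lp, fmaxf(a.x1, b.x1));
  const float h = fdimf(fminf(a.y2, b.y2) + lp, fmaxf(a.y1, b.y1));
  const float inter = w * h;
  return inter / (area_a + area_b - inter);
}
// ----------------------------------------------------------------------------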
} @@ -68,7 +70,8 @@ __launch_bounds__( const Box j_box = d_desc_sorted_boxes[j]; const Box i_box = shared_i_boxes[threadIdx.x]; const float j_area = - (j_box.x2 - j_box.x1 + 1.0f) * (j_box.y2 - j_box.y1 + 1.0f); + (j_box.x2 - j_box.x1 + float(int(legacy_plus_one))) * + (j_box.y2 - j_box.y1 + float(int(legacy_plus_one))); const float i_area = shared_i_areas[threadIdx.x]; // The following code will not be valid with empty boxes if (i_area == 0.0f || j_area == 0.0f) @@ -79,8 +82,8 @@ __launch_bounds__( const float yy2 = fminf(i_box.y2, j_box.y2); // fdimf computes the positive difference between xx2+1 and xx1 - const float w = fdimf(xx2 + 1.0f, xx1); - const float h = fdimf(yy2 + 1.0f, yy1); + const float w = fdimf(xx2 + float(int(legacy_plus_one)), xx1); + const float h = fdimf(yy2 + float(int(legacy_plus_one)), yy1); const float intersection = w * h; // Testing for a/b > t @@ -109,6 +112,7 @@ void nms_gpu_upright( const float* d_desc_sorted_boxes_float_ptr, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, @@ -132,7 +136,7 @@ void nms_gpu_upright( CAFFE_CUDA_NUM_THREADS_2D, 0, context->cuda_stream()>>>( - d_desc_sorted_boxes, N, thresh, mask_ld, d_delete_mask); + d_desc_sorted_boxes, N, thresh, legacy_plus_one, mask_ld, d_delete_mask); host_delete_mask.Resize(N * mask_ld); int* h_delete_mask = host_delete_mask.template mutable_data(); @@ -554,6 +558,7 @@ void nms_gpu( const float* d_desc_sorted_boxes, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, @@ -566,6 +571,7 @@ void nms_gpu( d_desc_sorted_boxes, N, thresh, + legacy_plus_one, d_keep_sorted_list, h_nkeep, dev_delete_mask, diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.h b/caffe2/operators/generate_proposals_op_util_nms_gpu.h index da7a8401ed12..10d081f1f38e 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu.h +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.h @@ -27,6 +27,7 @@ CAFFE2_API void nms_gpu_upright( const float* d_desc_sorted_boxes, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, @@ -55,6 +56,7 @@ CAFFE2_API void nms_gpu( const float* d_desc_sorted_boxes, const int N, const float thresh, + const bool legacy_plus_one, int* d_keep_sorted_list, int* h_nkeep, TensorCUDA& dev_delete_mask, diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc b/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc index 372accae0af4..cd1428de0682 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc +++ b/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc @@ -76,6 +76,7 @@ TEST(UtilsNMSTest, TestNMSGPU) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -207,7 +208,13 @@ TEST(UtilsNMSTest, TestPerfNMS) { // Running ntests runs of CPU NMS auto cpu_start = std::chrono::steady_clock::now(); for (int itest = 0; itest < ntests; ++itest) { - utils::nms_cpu(proposals, scores, indices, thresh); + utils::nms_cpu( + proposals, + scores, + indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); } auto cpu_stop = std::chrono::steady_clock::now(); @@ -244,6 +251,7 @@ TEST(UtilsNMSTest, TestPerfNMS) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -342,13 +350,19 @@ TEST(UtilsNMSTest, 
GPUEqualsCPUCorrectnessTest) { // Running ntests runs of CPU NMS for (int itest = 0; itest < ntests; ++itest) { - std::vector keep = - utils::nms_cpu(eig_proposals, eig_scores, sorted_indices, thresh); + std::vector keep = utils::nms_cpu( + eig_proposals, + eig_scores, + sorted_indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); int list_nitems; utils::nms_gpu( d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -439,6 +453,7 @@ TEST(UtilsNMSTest, TestNMSGPURotatedAngle0) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -507,7 +522,13 @@ TEST(UtilsNMSTest, TestPerfRotatedNMS) { // Running ntests runs of CPU NMS auto cpu_start = std::chrono::steady_clock::now(); for (int itest = 0; itest < ntests; ++itest) { - utils::nms_cpu(proposals, scores, indices, thresh); + utils::nms_cpu( + proposals, + scores, + indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); } auto cpu_stop = std::chrono::steady_clock::now(); @@ -544,6 +565,7 @@ TEST(UtilsNMSTest, TestPerfRotatedNMS) { d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, @@ -642,13 +664,19 @@ TEST(UtilsNMSTest, GPUEqualsCPURotatedCorrectnessTest) { // Running ntests runs of CPU NMS for (int itest = 0; itest < ntests; ++itest) { - std::vector keep = - utils::nms_cpu(eig_proposals, eig_scores, sorted_indices, thresh); + std::vector keep = utils::nms_cpu( + eig_proposals, + eig_scores, + sorted_indices, + thresh, + -1, /* topN */ + true /* legacy_plus_one */); int list_nitems; utils::nms_gpu( d_sorted_boxes, nboxes, thresh, + true, /* legacy_plus_one */ d_list, &list_nitems, dev_delete_mask, diff --git a/caffe2/operators/generate_proposals_op_util_nms_test.cc b/caffe2/operators/generate_proposals_op_util_nms_test.cc index 8e8b5f17afab..2d168446bc39 100644 --- a/caffe2/operators/generate_proposals_op_util_nms_test.cc +++ b/caffe2/operators/generate_proposals_op_util_nms_test.cc @@ -19,7 +19,8 @@ TEST(UtilsNMSTest, TestNMS) { auto proposals = input.block(0, 0, input.rows(), 4); auto scores = input.col(4); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, scores, input_thresh[i], true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -31,7 +32,13 @@ TEST(UtilsNMSTest, TestNMS) { indices.data() + indices.size(), [&scores](int lhs, int rhs) { return scores(lhs) > scores(rhs); }); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, indices, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -39,8 +46,13 @@ TEST(UtilsNMSTest, TestNMS) { std::vector top_n = {1, 1, 2, 2, 3}; auto gt_out = output_gt; for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = - utils::nms_cpu(proposals, scores, indices, input_thresh[i], top_n[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + top_n[i], + true /* legacy_plus_one */); gt_out[i].resize(top_n[i]); EXPECT_EQ(gt_out[i], cur_out); } @@ -92,7 +104,8 @@ TEST(UtilsNMSTest, TestNMS1) { 18, 19, 21, 23, 24, 25, 26, 30, 32, 33, 34, 35, 37, 43, 44, 47, 50}; - auto cur_out = utils::nms_cpu(proposals, scores, 0.5); + auto cur_out = + utils::nms_cpu(proposals, scores, 0.5, true /* 
legacy_plus_one */); std::sort(cur_out.begin(), cur_out.end()); EXPECT_EQ(output_gt, cur_out); } @@ -148,7 +161,9 @@ TEST(UtilsNMSTest, TestSoftNMS) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(keep, keep_gt); { auto diff = expected_scores - out_scores; @@ -165,7 +180,8 @@ TEST(UtilsNMSTest, TestSoftNMS) { overlap_thresh[i], 0.0001, method[i], - topN); + topN, + true /* legacy_plus_one */); std::vector expected_keep(keep_gt.begin(), keep_gt.begin() + topN); EXPECT_EQ(expected_keep, keep); } @@ -180,7 +196,9 @@ TEST(UtilsNMSTest, TestSoftNMS) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); std::sort(keep.begin(), keep.end()); EXPECT_EQ(indices, keep); { @@ -198,7 +216,9 @@ TEST(UtilsNMSTest, TestSoftNMS) { 0.5, overlap_thresh[i], score_thresh, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); { auto expected_keep = utils::GetArrayIndices(expected_scores >= score_thresh); @@ -235,7 +255,8 @@ TEST(UtilsNMSTest, TestNMSRotatedAngle0) { auto scores = input.col(4); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, scores, input_thresh[i], true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -247,7 +268,13 @@ TEST(UtilsNMSTest, TestNMSRotatedAngle0) { indices.data() + indices.size(), [&scores](int lhs, int rhs) { return scores(lhs) > scores(rhs); }); for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = utils::nms_cpu(proposals, scores, indices, input_thresh[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(output_gt[i], cur_out); } @@ -255,8 +282,13 @@ TEST(UtilsNMSTest, TestNMSRotatedAngle0) { std::vector top_n = {1, 1, 2, 2, 3}; auto gt_out = output_gt; for (int i = 0; i < input_thresh.size(); i++) { - auto cur_out = - utils::nms_cpu(proposals, scores, indices, input_thresh[i], top_n[i]); + auto cur_out = utils::nms_cpu( + proposals, + scores, + indices, + input_thresh[i], + top_n[i], + true /* legacy_plus_one */); gt_out[i].resize(top_n[i]); EXPECT_EQ(gt_out[i], cur_out); } @@ -322,7 +354,9 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); EXPECT_EQ(keep, keep_gt); { auto diff = expected_scores - out_scores; @@ -339,7 +373,8 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { overlap_thresh[i], 0.0001, method[i], - topN); + topN, + true /* legacy_plus_one */); std::vector expected_keep(keep_gt.begin(), keep_gt.begin() + topN); EXPECT_EQ(expected_keep, keep); } @@ -354,7 +389,9 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { 0.5, overlap_thresh[i], 0.0001, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); std::sort(keep.begin(), keep.end()); EXPECT_EQ(indices, keep); { @@ -372,7 +409,9 @@ TEST(UtilsNMSTest, TestSoftNMSRotatedAngle0) { 0.5, overlap_thresh[i], score_thresh, - method[i]); + method[i], + -1, /* topN */ + true /* legacy_plus_one */); { auto expected_keep = utils::GetArrayIndices(expected_scores >= score_thresh); diff --git a/caffe2/operators/given_tensor_fill_op.cc b/caffe2/operators/given_tensor_fill_op.cc index 6cf039c3d75b..bab8acca8322 100644 --- a/caffe2/operators/given_tensor_fill_op.cc +++ b/caffe2/operators/given_tensor_fill_op.cc @@ -7,6 +7,9 @@ 
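// Note (editor's sketch, not part of the patch): the hunks that follow add an
// int16 flavor of GivenTensorFill end to end -- CPU and CUDA registrations, a
// GivenTensorInt16Fill schema, and a new case in the dtype switch of
// GivenTensorFillOp. The new switch case, with the template argument written
// out as an assumption (int16_t, matching TensorProto_DataType_INT16):
//
//   case TensorProto_DataType_INT16:
//     ExtractValues<int16_t>();
//     break;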
REGISTER_CPU_OPERATOR( GivenTensorDoubleFill, GivenTensorFillOp); REGISTER_CPU_OPERATOR(GivenTensorBoolFill, GivenTensorFillOp); +REGISTER_CPU_OPERATOR( + GivenTensorInt16Fill, + GivenTensorFillOp); REGISTER_CPU_OPERATOR(GivenTensorIntFill, GivenTensorFillOp); REGISTER_CPU_OPERATOR( GivenTensorInt64Fill, @@ -18,6 +21,7 @@ REGISTER_CPU_OPERATOR( NO_GRADIENT(GivenTensorFill); NO_GRADIENT(GivenTensorDoubleFill); NO_GRADIENT(GivenTensorBoolFill); +NO_GRADIENT(GivenTensorInt16Fill); NO_GRADIENT(GivenTensorIntFill); NO_GRADIENT(GivenTensorInt64Fill); NO_GRADIENT(GivenTensorStringFill); @@ -141,6 +145,28 @@ OPERATOR_SCHEMA(GivenTensorBoolFill) "1D tensor containing the desired output shape. First input must be in CPU context.") .TensorInferenceFunction(FillerTensorInference); +OPERATOR_SCHEMA(GivenTensorInt16Fill) + .NumInputs(0, 1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .Arg( + "values", + "The value for the elements of the output tensor.", + true /* required */) + .Arg( + "shape", + "The shape of the output tensor." + "Cannot set the shape argument and pass in an input at the same time.") + .Arg( + "extra_shape", + "The additional dimensions appended at the end of the shape indicated" + "by the input blob." + "Cannot set the extra_shape argument when there is no input blob.") + .Arg( + "input_as_shape", + "1D tensor containing the desired output shape. First input must be in CPU context.") + .TensorInferenceFunction(FillerTensorInference); + OPERATOR_SCHEMA(GivenTensorIntFill) .NumInputs(0, 1) .NumOutputs(1) diff --git a/caffe2/operators/given_tensor_fill_op.cu b/caffe2/operators/given_tensor_fill_op.cu index af0b8863fb2c..706c95c6277f 100644 --- a/caffe2/operators/given_tensor_fill_op.cu +++ b/caffe2/operators/given_tensor_fill_op.cu @@ -7,6 +7,9 @@ REGISTER_CUDA_OPERATOR(GivenTensorFill, GivenTensorFillOp); REGISTER_CUDA_OPERATOR( GivenTensorDoubleFill, GivenTensorFillOp); +REGISTER_CUDA_OPERATOR( + GivenTensorInt16Fill, + GivenTensorFillOp); REGISTER_CUDA_OPERATOR(GivenTensorIntFill, GivenTensorFillOp); REGISTER_CUDA_OPERATOR( GivenTensorInt64Fill, diff --git a/caffe2/operators/given_tensor_fill_op.h b/caffe2/operators/given_tensor_fill_op.h index 1ba9f08f5c58..202958685e83 100644 --- a/caffe2/operators/given_tensor_fill_op.h +++ b/caffe2/operators/given_tensor_fill_op.h @@ -34,6 +34,9 @@ class GivenTensorFillOp final : public FillerOp { case TensorProto_DataType_BOOL: ExtractValues(); break; + case TensorProto_DataType_INT16: + ExtractValues(); + break; case TensorProto_DataType_INT32: ExtractValues(); break; diff --git a/caffe2/operators/instance_norm_op.cc b/caffe2/operators/instance_norm_op.cc index 5730cef37955..abd19a03cef3 100644 --- a/caffe2/operators/instance_norm_op.cc +++ b/caffe2/operators/instance_norm_op.cc @@ -15,20 +15,31 @@ bool InstanceNormOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE( !IsInputOutputAlias(INPUT, OUTPUT), "Can't run InstanceNorm NHWC in-place"); - auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_; - auto* inv_stdev = OutputSize() > 1 ? 
Output(INV_STDEV) : &inv_stdev_; + const int N = X.dim32(0); const int H = X.dim32(1); const int W = X.dim32(2); const int C = X.dim32(3); const size_t offset = H * W * C; - CAFFE_ENFORCE_EQ(Input(SCALE).numel(), C); CAFFE_ENFORCE_EQ(Input(BIAS).numel(), C); auto* Y = Output(OUTPUT, X.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(Context::GetDeviceType())); + mean = &mean_; + } + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(Context::GetDeviceType())); + inv_stdev = &inv_stdev_; + } + ConstEigenVectorArrayMap scale(Input(SCALE).template data(), C); ConstEigenVectorArrayMap bias(Input(BIAS).template data(), C); for (int n = 0; n < N; ++n) { @@ -66,19 +77,29 @@ bool InstanceNormOp::RunOnDeviceWithOrderNCHW() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); - auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_; - auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_; const int N = X.dim32(0); const int C = X.dim32(1); const int H = X.dim32(2); const int W = X.dim32(3); - CAFFE_ENFORCE_EQ(scale.numel(), C); CAFFE_ENFORCE_EQ(bias.numel(), C); auto* Y = Output(OUTPUT, X.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(Context::GetDeviceType())); + mean = &mean_; + } + + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(Context::GetDeviceType())); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(Context::GetDeviceType())); + inv_stdev = &inv_stdev_; + } const auto* Xdata = X.template data(); auto* Ydata = Y->template mutable_data(); diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu index 31ada0c23f59..66d73597f745 100644 --- a/caffe2/operators/instance_norm_op.cu +++ b/caffe2/operators/instance_norm_op.cu @@ -188,8 +188,6 @@ bool InstanceNormOp::RunOnDeviceWithOrderNHWC() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); - auto mean = OutputSize() >= 2 ? Output(MEAN) : &mean_; - auto inv_stdev = OutputSize() >= 3 ? 
Output(INV_STDEV) : &inv_stdev_; CAFFE_ENFORCE_EQ(4, input.dim()); const int N = input.dim32(0); const int H = input.dim32(1); @@ -200,8 +198,22 @@ bool InstanceNormOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(1, bias.dim()); CAFFE_ENFORCE_EQ(C, bias.dim32(0)); auto output = Output(OUTPUT, input.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(CUDA)); + mean = &mean_; + } + + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(CUDA)); + inv_stdev = &inv_stdev_; + } const auto input_data = input.data(); const auto scale_data = scale.data(); @@ -265,8 +277,6 @@ bool InstanceNormOp::RunOnDeviceWithOrderNCHW() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); - auto mean = OutputSize() >= 2 ? Output(MEAN) : &mean_; - auto inv_stdev = OutputSize() >= 3 ? Output(INV_STDEV) : &inv_stdev_; CAFFE_ENFORCE_EQ(4, input.dim()); const int N = input.dim32(0); const int C = input.dim32(1); @@ -277,8 +287,22 @@ bool InstanceNormOp::RunOnDeviceWithOrderNCHW() { CAFFE_ENFORCE_EQ(1, bias.dim()); CAFFE_ENFORCE_EQ(C, bias.dim32(0)); auto output = Output(OUTPUT, input.sizes(), at::dtype()); - mean->Resize(N, C); - inv_stdev->Resize(N, C); + + Tensor* mean; + if (OutputSize() >= 2) { + mean = Output(MEAN, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(CUDA)); + mean = &mean_; + } + + Tensor* inv_stdev; + if (OutputSize() >= 3) { + inv_stdev = Output(INV_STDEV, {N, C}, at::dtype().device(CUDA)); + } else { + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(CUDA)); + inv_stdev = &inv_stdev_; + } const auto input_data = input.data(); const auto scale_data = scale.data(); @@ -471,9 +495,9 @@ bool InstanceNormGradientOp::RunOnDeviceWithOrderNCHW() { const auto& scale = Input(SCALE); const auto& bias = Input(BIAS); const auto& output_grad = Input(OUTPUT_GRAD); + const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_; const auto& inv_stdev = InputSize() >= 6 ? 
Input(INV_STDEV) : inv_stdev_; - CAFFE_ENFORCE_EQ(4, input.dim()); const int N = input.dim32(0); const int C = input.dim32(1); @@ -507,7 +531,7 @@ bool InstanceNormGradientOp::RunOnDeviceWithOrderNCHW() { const auto dim_stride = 1; if (InputSize() < 5) { - mean_.Resize(N, C); + ReinitializeTensor(&mean_, {N, C}, at::dtype().device(CUDA)); auto mean_mutable_data = mean_.mutable_data(); InstanceNormMeanKernel<<< CAFFE_GET_BLOCKS(N * C), @@ -530,7 +554,7 @@ bool InstanceNormGradientOp::RunOnDeviceWithOrderNCHW() { const auto mean_data = mean.data(); if (InputSize() < 6) { - inv_stdev_.Resize(N, C); + ReinitializeTensor(&inv_stdev_, {N, C}, at::dtype().device(CUDA)); auto inv_stdev_mutable_data = inv_stdev_.mutable_data(); InstanceNormInvStdevKernel<<< CAFFE_GET_BLOCKS(N * C), diff --git a/caffe2/operators/instance_norm_op.h b/caffe2/operators/instance_norm_op.h index cbd6bd44bf44..3a665ff891c7 100644 --- a/caffe2/operators/instance_norm_op.h +++ b/caffe2/operators/instance_norm_op.h @@ -41,8 +41,8 @@ class InstanceNormOp : public Operator { StorageOrder order_; // temp results that get passed to the gradient, but are otherwise stored here - Tensor mean_{Context::GetDeviceType()}; - Tensor inv_stdev_{Context::GetDeviceType()}; + Tensor mean_; + Tensor inv_stdev_; INPUT_TAGS(INPUT, SCALE, BIAS); OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 78bf949902ea..b7666cb33645 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -95,22 +95,49 @@ void SetOutputTensorDescriptorTypeAndBuffer( desc->scales = &cpu_int8tensor->scale; desc->biases = &cpu_int8tensor->zero_point; } + +#ifndef C10_MOBILE +void CopyDescriptor( + const ExternalTensorDescriptor* from, + onnxTensorDescriptorV1* to) { + to->dataType = from->dataType; + to->buffer = from->buffer; + to->quantizationParams = from->quantizationParams; + to->quantizationAxis = from->quantizationAxis; + to->scales = from->scales; + to->biases = from->biases; + to->dimensions = from->dimensions; + to->shape = from->shape; +} +#endif + void BlobToTensorDescriptor( const std::string& name, Workspace* ws, onnxTensorDescriptorV1* desc, - std::vector>* shapes) { + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets) { const Blob* blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob ", name, " doesn't exist"); const bool is_int8tensor = blob->meta().id() == TypeMeta::Id(); + bool is_external_tensor; +#ifndef C10_MOBILE + auto function_ptr = + ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id()); + is_external_tensor = function_ptr != nullptr; +#else + is_external_tensor = false; +#endif // Memory type // We only allow weights to be CPU tensor or int8tensor for now CAFFE_ENFORCE( - (BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob)), + (BlobIsTensorType(*blob, CPU) || BlobIsInt8TensorCPUType(*blob) || + is_external_tensor), "Initialization blob ", name, - " needs to be TensorCPU or Int8TensorCPU"); + " needs to be TensorCPU or Int8TensorCPU or Int8FCDNNLowPPackedWeightBlob Based class"); desc->tag = ONNXIFI_TAG_TENSOR_DESCRIPTOR_V1; desc->memoryType = ONNXIFI_MEMORY_TYPE_CPU; @@ -124,6 +151,13 @@ void BlobToTensorDescriptor( desc->dimensions = shape.size(); shapes->emplace_back(shape.cbegin(), shape.cend()); desc->shape = shapes->back().data(); + } else if (is_external_tensor) { +#ifndef C10_MOBILE + ExternalTensorDescriptor ext_desc; + function_ptr->SetupExternalTensorDescriptor( + blob, shapes, all_scales, 
all_offsets, &ext_desc); + CopyDescriptor(&ext_desc, desc); +#endif } else { // Data type const auto& cpu_tensor = blob->template Get(); @@ -159,7 +193,8 @@ OnnxifiOp::buildInitializationList( weight_names->emplace_back(s); onnxTensorDescriptorV1 tensor_desc; tensor_desc.name = weight_names->back().c_str(); - BlobToTensorDescriptor(s, ws, &tensor_desc, weight_shapes); + BlobToTensorDescriptor( + s, ws, &tensor_desc, weight_shapes, &all_scales_, &all_offsets_); descs.push_back(tensor_desc); initialization_list.erase(it); } diff --git a/caffe2/operators/onnxifi_op.h b/caffe2/operators/onnxifi_op.h index 657f2cb5c180..4d5d71da5c5f 100644 --- a/caffe2/operators/onnxifi_op.h +++ b/caffe2/operators/onnxifi_op.h @@ -304,6 +304,10 @@ class OnnxifiOp final : public Operator { // dim but uint64_t for onnxDesciptor dim. Maybe we should just use int64_t c10::SmallVector tensor_dims_int64_; + // This is for multi group quantization info + std::vector> all_scales_; + std::vector> all_offsets_; + // output shape hints std::unordered_map output_shape_hints_; diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc index 254f917a5c97..34d19c2d0c00 100644 --- a/caffe2/operators/segment_reduction_op.cc +++ b/caffe2/operators/segment_reduction_op.cc @@ -579,3 +579,36 @@ REGISTER_LENGTHS_OPS_MAIN_INPUT_AND_FORWARD_OUTPUT_GRADIENT( LengthsMaxWithMainInputAndForwardOutputGradient, AbstractLengthsDef); } // namespace caffe2 + +// Macro doesn't like comma +using LengthsSumCPUOp = caffe2::AbstractLengthsDef< + float, + int, + caffe2::CPUContext, + caffe2::SumReducerDef, + true>::ForwardOp; +using LengthsMeanCPUOp = caffe2::AbstractLengthsDef< + float, + int, + caffe2::CPUContext, + caffe2::MeanReducerDef, + true>::ForwardOp; +using LengthsMaxCPUOp = caffe2::AbstractLengthsDef< + float, + int, + caffe2::CPUContext, + caffe2::MaxReducerDef, + true>::ForwardOp; + +C10_REGISTER_CAFFE2_OPERATOR_CPU( + LengthsSum, + "_caffe2::LengthsSum(Tensor data, Tensor lengths) -> Tensor", + LengthsSumCPUOp); +C10_REGISTER_CAFFE2_OPERATOR_CPU( + LengthsMean, + "_caffe2::LengthsMean(Tensor data, Tensor lengths) -> Tensor", + LengthsMeanCPUOp); +C10_REGISTER_CAFFE2_OPERATOR_CPU( + LengthsMax, + "_caffe2::LengthsMax(Tensor data, Tensor lengths) -> Tensor", + LengthsMaxCPUOp); diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index a2678a67ac95..24d4d84e65a3 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -1,11 +1,16 @@ #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ +#include "caffe2/core/c10_operator.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/operators/reducer_functors.h" +C10_DECLARE_CAFFE2_OPERATOR(LengthsSum); +C10_DECLARE_CAFFE2_OPERATOR(LengthsMean); +C10_DECLARE_CAFFE2_OPERATOR(LengthsMax); + namespace caffe2 { template diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu index 5e1099ef9dc1..fc4bc5a21c83 100644 --- a/caffe2/operators/segment_reduction_op_gpu.cu +++ b/caffe2/operators/segment_reduction_op_gpu.cu @@ -3,9 +3,9 @@ #include #include "caffe2/core/context_gpu.h" #include "caffe2/core/operator.h" +#include "caffe2/operators/segment_reduction_op.h" #include "caffe2/utils/math.h" - namespace caffe2 { namespace { @@ -411,8 +411,10 @@ template class CUDASparseLengthsSumOp : public Operator { public: 
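Aside (not part of the patch): the C10_REGISTER_CAFFE2_OPERATOR_CPU calls above expose LengthsSum/LengthsMean/LengthsMax through schemas of the form "_caffe2::LengthsSum(Tensor data, Tensor lengths) -> Tensor". As a reading aid, here is a minimal NumPy sketch of the lengths-based segment reduction those operators implement; the CUDA operator definitions continue below, and the helper name and sample values here are illustrative only, not part of this diff.

```python
# Reference sketch (plain NumPy, not the registered C++ operators):
# given `data` of shape (N, ...) and `lengths` summing to N, output row r
# reduces the next lengths[r] rows of `data`.
import numpy as np

def lengths_reduce(data, lengths, reducer=np.sum):
    assert lengths.sum() == data.shape[0], "lengths must partition data's first dim"
    out, offset = [], 0
    for n in lengths:
        out.append(reducer(data[offset:offset + n], axis=0))
        offset += n
    return np.stack(out)

data = np.arange(12, dtype=np.float32).reshape(6, 2)
lengths = np.array([2, 3, 1], dtype=np.int32)
print(lengths_reduce(data, lengths, np.sum))   # LengthsSum-like:  [[2, 4], [18, 21], [10, 11]]
print(lengths_reduce(data, lengths, np.mean))  # LengthsMean-like
print(lengths_reduce(data, lengths, np.max))   # LengthsMax-like
```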
USE_OPERATOR_CONTEXT_FUNCTIONS; - CUDASparseLengthsSumOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + + template + explicit CUDASparseLengthsSumOp(Args&&... args) + : Operator(std::forward(args)...) {} ~CUDASparseLengthsSumOp() {} @@ -531,8 +533,10 @@ template class CUDASparseLengthsMeanOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - CUDASparseLengthsMeanOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + + template + explicit CUDASparseLengthsMeanOp(Args&&... args) + : Operator(std::forward(args)...) {} ~CUDASparseLengthsMeanOp() {} @@ -652,8 +656,10 @@ template class CUDASparseLengthsMaxOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - CUDASparseLengthsMaxOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + + template + explicit CUDASparseLengthsMaxOp(Args&&... args) + : Operator(std::forward(args)...) {} ~CUDASparseLengthsMaxOp() {} @@ -966,7 +972,10 @@ class CUDAUnsortedSegmentSumOp : public Operator { context_.cuda_stream()); // the second call do the real computation. - ReinitializeTensor(&buffer_tensor_, {static_cast(tmp_storage_bytes)}, at::dtype().device(CUDA)); + ReinitializeTensor( + &buffer_tensor_, + {static_cast(tmp_storage_bytes)}, + at::dtype().device(CUDA)); cub::DeviceReduce::Max( static_cast(buffer_tensor_.mutable_data()), tmp_storage_bytes, @@ -996,46 +1005,47 @@ class CUDAUnsortedSegmentSumOp : public Operator { output->numel(), T(0), output->template mutable_data(), &context_); if (!mean) { - UnsortedSegmentSumKernel<<< - CAFFE_GET_BLOCKS(data.numel()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - data.numel(), - slize_sz, - segment_ids.template data(), - data.template data(), - output->template mutable_data(), - nullptr); + UnsortedSegmentSumKernel + <<>>( + data.numel(), + slize_sz, + segment_ids.template data(), + data.template data(), + output->template mutable_data(), + nullptr); } else { // For mean, we need to compute scaling factors - ReinitializeTensor(&scaling_factors_, {K + 1}, at::dtype().device(CUDA)); + ReinitializeTensor( + &scaling_factors_, {K + 1}, at::dtype().device(CUDA)); math::Set( scaling_factors_.numel(), int(0), scaling_factors_.template mutable_data(), &context_); - UnsortedSegmentSumKernel<<< - CAFFE_GET_BLOCKS(data.numel()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - data.numel(), - slize_sz, - segment_ids.template data(), - data.template data(), - output->template mutable_data(), - scaling_factors_.template mutable_data()); + UnsortedSegmentSumKernel + <<>>( + data.numel(), + slize_sz, + segment_ids.template data(), + data.template data(), + output->template mutable_data(), + scaling_factors_.template mutable_data()); // Divide by the scaling factors to get means - SegmentScalingKernel<<< - CAFFE_GET_BLOCKS(output->numel()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - output->numel(), - slize_sz, - scaling_factors_.template data(), - output->template mutable_data()); + SegmentScalingKernel + <<numel()), + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + output->numel(), + slize_sz, + scaling_factors_.template data(), + output->template mutable_data()); } return true; } @@ -1821,3 +1831,15 @@ REGISTER_CUDA_OPERATOR( LengthsIndicesInGradientMeanGradient, CUDASparseLengthsMeanGradientWithIndicesOp); } // namespace caffe2 + +// Macro doesn't like comma +using LengthsSumCUDAOp = + caffe2::CUDASparseLengthsSumOp; +using LengthsMeanCUDAOp = + 
caffe2::CUDASparseLengthsMeanOp; +using LengthsMaxCUDAOp = + caffe2::CUDASparseLengthsMaxOp; + +C10_REGISTER_CAFFE2_OPERATOR_CUDA(LengthsSum, LengthsSumCUDAOp); +C10_REGISTER_CAFFE2_OPERATOR_CUDA(LengthsMean, LengthsMeanCUDAOp); +C10_REGISTER_CAFFE2_OPERATOR_CUDA(LengthsMax, LengthsMaxCUDAOp); diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 6867d87c85f4..62b27c603cfc 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -66,11 +66,10 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { template explicit SparseToDenseMaskOp(Args&&... args) : SparseToDenseMaskBase(std::forward(args)...) { - returnPresenceMask_ = this->template GetSingleArgument( - "return_presence_mask", false); - maxSkippedSparseIndices_ = - this->template GetSingleArgument( - "max_skipped_indices", kMaxSkippedSparseIndices); + returnPresenceMask_ = + this->template GetSingleArgument("return_presence_mask", false); + maxSkippedRows_ = this->template GetSingleArgument( + "max_skipped_indices", kMaxSkippedSparseIndices); } bool RunOnDevice() override { @@ -151,15 +150,13 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { int64_t offset = 0; for (int r = 0; r < rows; r++) { + bool skippedSparseIndex = false; for (int c = 0; c < lengths_vec[r]; c++) { const auto sparse_index = sparse_indices_vec[offset + c]; if (sparse_index < 0 || sparse_index >= std::numeric_limits::max()) { + skippedSparseIndex = true; LOG(WARNING) << "Skipping invalid sparse index: " << sparse_index; - CAFFE_ENFORCE_LT( - ++skippedSparseIndices_, - maxSkippedSparseIndices_, - "Too many sparse indices skipped"); continue; } int idx = this->getFeatureIdx(sparse_index); @@ -174,6 +171,11 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { } } } + skippedRows_ += skippedSparseIndex; + CAFFE_ENFORCE_LT( + skippedRows_, + maxSkippedRows_, + "Too many rows with invalid sparse indices skipped"); offset += lengths_vec[r]; } @@ -181,11 +183,11 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { } private: - static const uint32_t kMaxSkippedSparseIndices = 5; + static const uint32_t kMaxSkippedSparseIndices = 50; bool returnPresenceMask_; - uint32_t maxSkippedSparseIndices_ = 0; - uint32_t skippedSparseIndices_ = 0; + uint32_t maxSkippedRows_ = 0; + uint32_t skippedRows_ = 0; INPUT_TAGS(INDICES, VALUES, DEFAULT, LENGTHS); OUTPUT_TAGS(OUTPUTVALUE, PRESENCEMASK); diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 0edc47efea31..10cec6c9ccb3 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -51,6 +51,7 @@ REGISTER_CPU_OPERATOR( ScatterWeightedSum, ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); +REGISTER_CPU_OPERATOR(Scatter, ScatterOp); REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); @@ -369,6 +370,38 @@ Currently only works on CPU because of access to INDICES. "Update slices, with shape len(INDICES) + shape(X_0)[1:]") .Output(0, "DATA", "Has to be exactly the same tensor as the input 0"); +OPERATOR_SCHEMA(Scatter) + .NumInputs(3) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .SetDoc(R"DOC( +Update values of the tensor by overriding current value specified by indices. + +Writes all values from the tensor UPDATES into DATA at the indices specified in the INDICES tensor. 
+For each value in DATA, its output index is specified by its index in UPDATES and by the corresponding value in INDICES for the specified axis.
+
+For a 3-D tensor, DATA is updated as:
+
+DATA[INDICES[i][j][k]][j][k] = UPDATES[i][j][k]  # if axis == 0
+DATA[i][INDICES[i][j][k]][k] = UPDATES[i][j][k]  # if axis == 1
+DATA[i][j][INDICES[i][j][k]] = UPDATES[i][j][k]  # if axis == 2
+
+Currently only works on CPU because of access to INDICES.
+)DOC")
+    .Input(0, "DATA", "Tensor to be updated.")
+    .Input(
+        1,
+        "INDICES",
+        "1-D list of indices on the first dimension"
+        "of X_0 that need to be updated")
+    .Input(
+        2,
+        "UPDATES",
+        "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
+    .Output(0, "OUTPUT", "The updated output.")
+    .Arg(
+        "axis",
+        "*(type: int; default: 1)* Which dimension to scatter on.");
 OPERATOR_SCHEMA(HasElements)
     .NumInputs(1)
@@ -739,6 +772,7 @@ REGISTER_GRADIENT(Sum, GetSumGradient);
 SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
 SHOULD_NOT_DO_GRADIENT(ScatterAssign);
+SHOULD_NOT_DO_GRADIENT(Scatter);
 class GetWeightedSumGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h
index 2b38d1b67f96..8dd1c45cc461 100644
--- a/caffe2/operators/utility_ops.h
+++ b/caffe2/operators/utility_ops.h
@@ -738,6 +738,106 @@ class ScatterAssignOp : public Operator {
   INPUT_TAGS(DATA, INDICES, SLICES);
 };
+template
+class ScatterOp : public Operator {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  template
+  explicit ScatterOp(Args&&... args)
+      : Operator(std::forward(args)...),
+        OP_SINGLE_ARG(int, "axis", axis_, 1) {
+  }
+
+  virtual ~ScatterOp() noexcept override {}
+
+  bool RunOnDevice() override {
+
+    TORCH_CHECK(Context::GetDeviceType() == kCPU, "ScatterOp currently only supports CPU.")
+
+    return DispatchHelper>::call(
+        this, this->template Input(INDICES, CPU));
+  }
+
+  template
+  bool DoRunWithType() {
+    const Tensor& data = Input(DATA);
+    const Tensor& indices = Input(INDICES);
+    const Tensor& updates = Input(UPDATES);
+    const TypeMeta dataType = data.dtype();
+    size_t item_bytesize = dataType.itemsize();
+
+    // ONNX allows negative axis to index from the back, valid range: [-r, r].
+    axis_ = data.canonical_axis_index(axis_);
+
+    CAFFE_ENFORCE_GE(data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D");
+    CAFFE_ENFORCE_GE(axis_, 0, "Axis should be non-negative");
+    CAFFE_ENFORCE_LT(axis_, data.dim(), "Axis out of range");
+
+    Tensor* output = Output(0, data.sizes().vec(), at::dtype(dataType));
+    output->CopyFrom(data);
+    char* out = static_cast(output->raw_mutable_data(dataType));
+
+    // Succeed if size of output is zero, which can happen for empty batch which
+    // would have data dimension size of 0.
+    // This *must* be done AFTER output->raw_mutable_data() above as that has
+    // important allocation side effect that we must see.
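Aside (not part of the patch): the ScatterOp implementation continues below; as a reading aid, here is a minimal NumPy sketch of the axis == 0 update rule spelled out in the Scatter schema doc above. The operator's default axis is 1, and the function name and sample values here are illustrative only, not part of this diff.

```python
# Sketch of DATA[INDICES[i][j]][j] = UPDATES[i][j] (axis == 0, 2-D case).
import numpy as np

def scatter_axis0(data, indices, updates):
    out = data.copy()  # Scatter copies DATA, then overwrites the indexed entries
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            out[indices[i, j], j] = updates[i, j]
    return out

data = np.zeros((3, 3), dtype=np.float32)
indices = np.array([[1, 0, 2], [0, 2, 1]])
updates = np.array([[1.0, 1.1, 1.2], [2.0, 2.1, 2.2]], dtype=np.float32)
print(scatter_axis0(data, indices, updates))
# [[2.  1.1 0. ]
#  [1.  0.  2.2]
#  [0.  2.1 1.2]]
```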
+ if (output->numel() == 0) { + return true; + } + + const IndexType* idxs = indices.template data(); + const char* src_base = static_cast(updates.raw_data()); + + const int64_t outer_dims_product = updates.size_to_dim(axis_); + const int64_t block_size = updates.size_from_dim(axis_ + 1); + const int64_t block_bytesize = block_size * item_bytesize; + + const int64_t src_indexing_axis_dim = updates.size(axis_); + const int64_t src_batch_bytesize = updates.size_from_dim(axis_) * item_bytesize; + const int64_t dst_batch_size = data.size_from_dim(axis_) * item_bytesize; + + const int64_t N = indices.size(axis_); + + check_indexarray_range(idxs, N, src_indexing_axis_dim); + + int64_t i = 0; + for (int64_t batch = 0; batch < outer_dims_product; ++batch) { + int64_t i_max = i + N; + for (; i < i_max && i < indices.numel(); ++i) { + auto idx = idxs[i]; + + auto src = src_base + batch * src_batch_bytesize + idx * block_bytesize; + auto dst = out + batch * dst_batch_size + (i - i_max + N) * block_bytesize; + context_.CopyItemsSameDevice(dataType, block_size, src, dst); + } + } + return true; + } + + INPUT_TAGS(DATA, INDICES, UPDATES); + + // Check that indices fall within dimension array size with CAFFE_ENFORCE. + template + static void check_indexarray_range( + const IndexType* indices, + int64_t n, + IndexType indexing_axis_dim) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < indexing_axis_dim, + "INDICES element is out of DATA bounds, id=", + idx, + " axis_dim=", + indexing_axis_dim); + } + } + + protected: + int axis_; +}; + template class LengthsToSegmentIdsOp : public Operator { public: diff --git a/caffe2/opt/backend_transformer_base.cc b/caffe2/opt/backend_transformer_base.cc index 21bca5826991..45f88bda400f 100644 --- a/caffe2/opt/backend_transformer_base.cc +++ b/caffe2/opt/backend_transformer_base.cc @@ -54,8 +54,16 @@ QTensorProto BackendTransformerBase::wrapShapeInfoIntoQTensorProto( "Only quantized shapeinfo can be extracted into QTensor!"); t.set_name(name); t.set_data_type(shape_info.shape.data_type()); - t.set_scale(shape_info.q_info.scale); - t.set_bias(shape_info.q_info.offset); + t.set_axis(shape_info.q_info.axis); + t.set_is_multiparam(true); + for (const auto i : shape_info.q_info.scale) { + t.add_scales(i); + } + t.set_scale(1.0); + for (const auto i : shape_info.q_info.offset) { + t.add_biases(i); + } + t.set_bias(0.0); // precision and is_signed is not used in onnxifi workflow, but it is required // field t.set_precision(0); @@ -119,9 +127,9 @@ ShapeInfoMap BackendTransformerBase::inferShapes( shape_map.emplace(s, shape_info); } } - BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(*pred_net, shape_map); - const auto& out_map = eng.shape_info(); + auto eng = BoundShapeInferencerRegistry()->Create("C10", spec); + eng->InferBoundShapeAndType(*pred_net, shape_map, ws); + const auto& out_map = eng->shape_info(); shape_map.clear(); for (const auto& kv : out_map) { shape_map.emplace( diff --git a/caffe2/opt/bound_shape_inference_test.cc b/caffe2/opt/bound_shape_inference_test.cc index 9d4d274b3954..961a03397122 100644 --- a/caffe2/opt/bound_shape_inference_test.cc +++ b/caffe2/opt/bound_shape_inference_test.cc @@ -51,7 +51,7 @@ TEST(BoundShapeInference, SparseLengthsSum) { "Weights", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {1000, 16})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& 
out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "Weights", ShapeInfo::DimType::CONSTANT, {1000, 16}); @@ -86,7 +86,7 @@ TEST(BoundShapeInference, SparseLengthsSumFused8BitRowwise) { ShapeInfo::DimType::CONSTANT, {1000, 58}, TensorProto_DataType_INT8)); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, @@ -127,7 +127,7 @@ TEST(BoundShapeInference, LengthsRangeFill) { ShapeInfoMap shape_map; BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, @@ -175,7 +175,7 @@ TEST(BoundShapeInference, Reshape) { shape_map.emplace("B0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {16})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 1024}); @@ -203,7 +203,7 @@ TEST(BoundShapeInference, ConcatMissingInput) { "I0", makeTensorInfo(ShapeInfo::DimType::BATCH, {spec.max_batch_size, 60})); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "I0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 60}); @@ -233,7 +233,7 @@ TEST(BoundShapeInference, ConcatInferInputBackwards) { "W0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {101, 16})); shape_map.emplace("B0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {16})); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "I0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 60}); @@ -274,7 +274,7 @@ TEST(BoundShapeInference, Split) { "X1", makeTensorInfo(ShapeInfo::DimType::BATCH, {spec.max_batch_size, 2, 48})); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 48}); @@ -317,7 +317,7 @@ TEST(BoundShapeInference, FC) { shape_map.emplace("B1", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {1024})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 1024}); @@ -342,7 +342,7 @@ TEST(BoundShapeInference, FC3D) { shape_map.emplace("B0", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {16})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); verifyShapeInfo( out_shape, "X0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 1024}); @@ -350,46 +350,6 @@ TEST(BoundShapeInference, FC3D) { out_shape, "Out0", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 16}); } -TEST(BoundShapeInference, 
ClipRangesGatherSigridHash) { - FLAGS_caffe2_extract_feature_length_for_shape_inference = true; - NetDef net; - net.add_op()->CopyFrom(CreateOperatorDef( - "ClipRangesGatherSigridHash", - "", - {"R0", "V0"}, - {"F0_lengths_0", "F0_values_0", "F1_lengths_0", "F1_values_0"}, - {MakeArgument>("max_lengths", {200, 400})})); - ShapeInfoMap shape_map; - BoundShapeSpec spec(50, 1000); - BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); - const auto& out_shape = eng.shape_info(); - verifyShapeInfo( - out_shape, - "F0_lengths_0", - ShapeInfo::DimType::BATCH, - {spec.max_batch_size}, - TensorProto_DataType_INT32); - verifyShapeInfo( - out_shape, - "F0_values_0", - ShapeInfo::DimType::SEQ, - {spec.max_batch_size * 200}, - TensorProto_DataType_INT64); - verifyShapeInfo( - out_shape, - "F1_lengths_0", - ShapeInfo::DimType::BATCH, - {spec.max_batch_size}, - TensorProto_DataType_INT32); - verifyShapeInfo( - out_shape, - "F1_values_0", - ShapeInfo::DimType::SEQ, - {spec.max_batch_size * 400}, - TensorProto_DataType_INT64); -} - TEST(BoundShapeInference, Combo0) { NetDef net; net.add_op()->CopyFrom(CreateOperatorDef( @@ -421,56 +381,9 @@ TEST(BoundShapeInference, Combo0) { "Indices", makeTensorInfo(ShapeInfo::DimType::CONSTANT, {2})); BoundShapeSpec spec(20, 1000); BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); + eng.InferBoundShapeAndType(net, shape_map, nullptr); const auto& out_shape = eng.shape_info(); LOG(INFO) << eng.PrintShapeInfo(); verifyShapeInfo( out_shape, "Gout", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 2}); } - -TEST(BoundShapeInference, Combo1) { - FLAGS_caffe2_extract_feature_length_for_shape_inference = true; - NetDef net; - net.add_op()->CopyFrom(CreateOperatorDef( - "ClipRangesGatherSigridHash", - "", - {"R0", "V0"}, - {"F0_lengths_0", "F0_values_0", "F1_lengths_0", "F1_values_0"}, - {MakeArgument>("max_lengths", {300, 400})})); - - net.add_op()->CopyFrom(CreateOperatorDef( - "SparseLengthsSumFused8BitRowwise", - "", - {"Weights", "F0_values_0", "F0_lengths_0"}, - {"Out"}, - {})); - ShapeInfoMap shape_map; - shape_map.emplace( - "Weights", - makeTensorInfo( - ShapeInfo::DimType::CONSTANT, {1000, 58}, TensorProto_DataType_INT8)); - BoundShapeSpec spec(20, 1000); - BoundShapeInferencer eng(spec); - eng.InferBoundShapeAndType(net, shape_map); - const auto& out_shape = eng.shape_info(); - verifyShapeInfo( - out_shape, - "Weights", - ShapeInfo::DimType::CONSTANT, - {1000, 58}, - TensorProto_DataType_INT8); - verifyShapeInfo( - out_shape, - "F0_values_0", - ShapeInfo::DimType::SEQ, - {spec.max_batch_size * 300}, - TensorProto_DataType_INT64); - verifyShapeInfo( - out_shape, - "F0_lengths_0", - ShapeInfo::DimType::BATCH, - {spec.max_batch_size}, - TensorProto_DataType_INT32); - verifyShapeInfo( - out_shape, "Out", ShapeInfo::DimType::BATCH, {spec.max_batch_size, 50}); -} diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index 58652e4ab725..68167609ac44 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -38,17 +38,19 @@ int64_t SizeToDim(const TensorShape& shape, int axis) { } return r; } +} // namespace -void EnsureShapeNames(std::unordered_map* info) { +void BoundShapeInferencer::EnsureShapeNames( + std::unordered_map* info) const { for (auto& kv : *info) { kv.second.shape.set_name(kv.first); } } -} // namespace void BoundShapeInferencer::InferBoundShapeAndType( const NetDef& net, - const std::unordered_map& info) { + const std::unordered_map& 
info, + caffe2::Workspace* ws) { const static std::unordered_set unsupported{"Tile"}; shape_info_ = info; @@ -79,10 +81,6 @@ void BoundShapeInferencer::InferBoundShapeAndType( InferGivenTensorFill(op); } else if (op.type() == "Shape") { InferShape(op); - } else if ( - op.type() == "ClipRangesGatherSigridHash" && - FLAGS_caffe2_extract_feature_length_for_shape_inference) { - InferClipRangesGatherSigridHash(op); } else { InferCommonOp(op); } @@ -125,8 +123,11 @@ TensorShape& BoundShapeInferencer::CheckAndSetTensorShapeAndType( TensorShape& shape = shape_info.shape; if (is_quantized) { shape_info.is_quantized = true; - shape_info.q_info.scale = 1; - shape_info.q_info.offset = 0; + shape_info.q_info.scale.clear(); + shape_info.q_info.scale.push_back(1); + shape_info.q_info.offset.clear(); + shape_info.q_info.offset.push_back(0); + shape_info.q_info.axis = 1; } if (!rt.second) { // Check shape consistency @@ -340,9 +341,9 @@ void BoundShapeInferencer::InferConcatInputs(const OperatorDef& op) { } } -// For concat net, if some inputs are missing and we have add_axis argument, it -// means that all the inputs should be of the same dimension. In this case, we -// can infer the shape of the missing inputs +// For concat net, if some inputs are missing and we have add_axis argument, +// it means that all the inputs should be of the same dimension. In this case, +// we can infer the shape of the missing inputs void BoundShapeInferencer::InferConcat(const OperatorDef& op) { ArgumentHelper helper(op); auto add_axis = helper.GetSingleArgument("add_axis", 0); @@ -418,7 +419,8 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) { const ShapeInfo& b_shape_info = b_it->second; auto x_it = shape_info_.find(op.input(0)); if (x_it == shape_info_.end()) { - // We don't have a hint at the x input we try to deduce it from weight shape + // We don't have a hint at the x input we try to deduce it from weight + // shape ArgumentHelper helper(op); auto axis = helper.GetSingleArgument("axis", 1); auto axis_w = helper.GetSingleArgument("axis_w", 1); @@ -464,104 +466,67 @@ void BoundShapeInferencer::InferFC(const OperatorDef& op) { false); } -void BoundShapeInferencer::InferClipRangesGatherSigridHash( - const OperatorDef& op) { - CAFFE_ENFORCE( - op.output_size() % 2 == 0, - "ClipRangesGatherSigridHash has to have even number of outputs"); - ArgumentHelper helper(op); - auto max_lengths_arg = helper.GetRepeatedArgument("max_lengths"); - CAFFE_ENFORCE_EQ( - max_lengths_arg.size() * 2, - op.output_size(), - "Output size of ClipRangesGatherSigridHash has to be the same with 2 * length of max_lengths arg"); - for (int i = 0; i < op.output_size(); i++) { - auto output_name = op.output(i); - if (i % 2 == 0) { - CAFFE_ENFORCE( - output_name.find("lengths") != std::string::npos, - "In ClipRangesGatherSigridHash, name of output in even index has to contain 'lengths'"); - CheckAndSetTensorShapeAndType( - output_name, - ShapeInfo::DimType::BATCH, - {spec_.max_batch_size}, - TensorProto_DataType_INT32, - false); - } else { - CAFFE_ENFORCE( - output_name.find("values") != std::string::npos, - "In ClipRangesGatherSigridHash, name of output in odd index has to contain 'values'"); - CheckAndSetTensorShapeAndType( - output_name, - ShapeInfo::DimType::SEQ, - {max_lengths_arg[i / 2] * spec_.max_batch_size}, - TensorProto_DataType_INT64, - false); - } - } -} - void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { // First, we need to check that all the input shape/types are already // presented try { - std::vector 
input_shapes; - for (const auto& input : op.input()) { - const auto it = shape_info_.find(input); - if (it == shape_info_.end()) { - LOG(WARNING) << "Cannot find shape info for " << input << ". Skipping " - << op.type(); - return; + std::vector input_shapes; + for (const auto& input : op.input()) { + const auto it = shape_info_.find(input); + if (it == shape_info_.end()) { + LOG(WARNING) << "Cannot find shape info for " << input << ". Skipping " + << op.type(); + return; + } + input_shapes.emplace_back(it->second.shape); } - input_shapes.emplace_back(it->second.shape); - } - const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); - CAFFE_ENFORCE(schema); - std::vector output_shapes; + const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); + CAFFE_ENFORCE(schema); + std::vector output_shapes; output_shapes = schema->InferTensor(op, input_shapes); - int i = 0; - bool is_quantized = - !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize"); - TensorProto::DataType infered_data_type = TensorProto::UNDEFINED; - if (is_quantized) { - const static std::map type_info_from_input = { - {"Int8Quantize", -1}, // Force this op's output to be uint8 - {"Int8ConvRelu", 1}, - {"Int8MaxPool", 0}, - {"Int8AveragePool", 0}, - {"Int8FC", 1}, - {"Int8Conv", 1}, - {"Int8SumRelu", 0}}; - CAFFE_ENFORCE( - type_info_from_input.find(op.type()) != type_info_from_input.end(), - "Undefined quantized output data type, add it into type_info_from_input"); - int target = type_info_from_input.find(op.type())->second; - if (target == -1) { - infered_data_type = TensorProto::UINT8; - } else { - CAFFE_ENFORCE(target < input_shapes.size()); - infered_data_type = input_shapes[target].data_type(); + int i = 0; + bool is_quantized = + !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize"); + TensorProto::DataType infered_data_type = TensorProto::UNDEFINED; + if (is_quantized) { + const static std::map type_info_from_input = { + {"Int8Quantize", -1}, // Force this op's output to be uint8 + {"Int8ConvRelu", 1}, + {"Int8MaxPool", 0}, + {"Int8AveragePool", 0}, + {"Int8FC", 1}, + {"Int8Conv", 1}, + {"Int8SumRelu", 0}}; + CAFFE_ENFORCE( + type_info_from_input.find(op.type()) != type_info_from_input.end(), + "Undefined quantized output data type, add it into type_info_from_input"); + int target = type_info_from_input.find(op.type())->second; + if (target == -1) { + infered_data_type = TensorProto::UINT8; + } else { + CAFFE_ENFORCE(target < input_shapes.size()); + infered_data_type = input_shapes[target].data_type(); + } + } else if (op.type() == "Int8Dequantize") { + infered_data_type = TensorProto::FLOAT; } - } else if (op.type() == "Int8Dequantize") { - infered_data_type = TensorProto::FLOAT; - } - for (const auto& shape : output_shapes) { - if (infered_data_type == TensorProto::UNDEFINED) { - infered_data_type = shape.data_type(); - } - if (shape.unknown_shape()) { - ++i; - continue; + for (const auto& shape : output_shapes) { + if (infered_data_type == TensorProto::UNDEFINED) { + infered_data_type = shape.data_type(); + } + if (shape.unknown_shape()) { + ++i; + continue; + } + CheckAndSetTensorShapeAndType( + op.output(i++), + current_dim_type_, + ConvertToVec(shape.dims()), + infered_data_type, + is_quantized); } - CheckAndSetTensorShapeAndType( - op.output(i++), - current_dim_type_, - ConvertToVec(shape.dims()), - infered_data_type, - is_quantized); - } } catch (const caffe2::EnforceNotMet& e) { LOG(ERROR) << "Enforce not met while inferring shapes for " << op.type() << ": " << e.msg(); @@ 
-571,4 +536,18 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { } } +std::shared_ptr getBoundShapeInferencer( + const BoundShapeSpec& spec) { + return std::make_shared(spec); +} + +C10_DEFINE_SHARED_REGISTRY( + BoundShapeInferencerRegistry, + BoundShapeInferencerBase, + const BoundShapeSpec&); + +C10_REGISTER_CREATOR( + BoundShapeInferencerRegistry, + C10, + getBoundShapeInferencer); } // namespace caffe2 diff --git a/caffe2/opt/bound_shape_inferencer.h b/caffe2/opt/bound_shape_inferencer.h index 2b5a4ce067dc..77bb3003bc61 100644 --- a/caffe2/opt/bound_shape_inferencer.h +++ b/caffe2/opt/bound_shape_inferencer.h @@ -29,16 +29,19 @@ struct CAFFE2_API BoundShapeSpec { /// then propagates the bound shape down the network. For now the variable part /// (bound part) is the first dimension of the shape, which usually corresponds /// to the batch size or sequence lookup size. -class CAFFE2_API BoundShapeInferencer { +class BoundShapeInferencerBase { public: - explicit BoundShapeInferencer(const BoundShapeSpec& spec) : spec_(spec) { + explicit BoundShapeInferencerBase(const BoundShapeSpec& spec) : spec_(spec) { CAFFE_ENFORCE_GE(spec_.max_batch_size, 0); CAFFE_ENFORCE_GE(spec_.max_seq_size, 0); } - void InferBoundShapeAndType( + virtual ~BoundShapeInferencerBase() {} + + virtual void InferBoundShapeAndType( const NetDef& net, - const std::unordered_map& info); + const std::unordered_map& info, + caffe2::Workspace* ws) = 0; const ShapeInfoMap& shape_info() const { return shape_info_; @@ -58,7 +61,24 @@ class CAFFE2_API BoundShapeInferencer { return ss.str(); } - private: + protected: + const BoundShapeSpec spec_; + std::unordered_map shape_info_; +}; + +class CAFFE2_API BoundShapeInferencer : public BoundShapeInferencerBase { + public: + explicit BoundShapeInferencer(const BoundShapeSpec& spec) + : BoundShapeInferencerBase(spec) {} + + virtual ~BoundShapeInferencer() override {} + + void InferBoundShapeAndType( + const NetDef& net, + const std::unordered_map& info, + caffe2::Workspace* ws) override; + + protected: TensorShape& CheckAndSetTensorShapeAndType( const std::string& name, ShapeInfo::DimType t, @@ -83,16 +103,23 @@ class CAFFE2_API BoundShapeInferencer { void InferShape(const OperatorDef& op); void InferReshape(const OperatorDef& op); void InferLengthsRangeFill(const OperatorDef& op); - void InferClipRangesGatherSigridHash(const OperatorDef& op); // Standard shape/type inference using op schema registered shape inference // function void InferCommonOp(const OperatorDef& op); - const BoundShapeSpec spec_; + void EnsureShapeNames(std::unordered_map* info) const; + ShapeInfo::DimType current_dim_type_{ShapeInfo::DimType::BATCH}; int64_t current_max_batch_size_{0}; - std::unordered_map shape_info_; }; +CAFFE2_API std::shared_ptr getBoundShapeInferencer( + const BoundShapeSpec& spec); + +C10_DECLARE_SHARED_REGISTRY( + BoundShapeInferencerRegistry, + BoundShapeInferencerBase, + const BoundShapeSpec&); + } // namespace caffe2 diff --git a/caffe2/opt/optimize_ideep.cc b/caffe2/opt/optimize_ideep.cc index d770479512b9..f0d251e66e99 100644 --- a/caffe2/opt/optimize_ideep.cc +++ b/caffe2/opt/optimize_ideep.cc @@ -79,6 +79,11 @@ bool isOnIdeepDevice(const repr::NeuralNetOperator& nnOp) { } bool isConvFusion(repr::NNGraph::NodeRef convNode, int fusion_type) { + // Here we only check the type of ConvFusion op (for FP32 only) + if (!repr::nn::is(convNode)) { + return false; + } + auto conv = repr::nn::get(convNode); auto& op = getOpDef(*conv); diff --git a/caffe2/opt/shape_info.cc 
b/caffe2/opt/shape_info.cc index ebd7b68303be..7eb8d0d3726e 100644 --- a/caffe2/opt/shape_info.cc +++ b/caffe2/opt/shape_info.cc @@ -14,7 +14,23 @@ ShapeInfo getShapeInfoFromBlob(const Blob* blob) { if (blob->meta().id() == TypeMeta::Id()) { shape_info.is_quantized = true; LoadInt8TensorInfoOfBlob( - &shape_info.q_info.scale, &shape_info.q_info.offset, blob); + &shape_info.q_info.scale, + &shape_info.q_info.offset, + &shape_info.q_info.axis, + blob); + } else { +#ifndef C10_MOBILE + auto function_ptr = + ExternalTensorFunctionsBaseRegistry()->Create(blob->meta().id()); + if (function_ptr != nullptr) { + shape_info.is_quantized = true; + function_ptr->LoadInfoOfBlob( + blob, + &shape_info.q_info.scale, + &shape_info.q_info.offset, + &shape_info.q_info.axis); + } +#endif } return shape_info; } diff --git a/caffe2/opt/shape_info.h b/caffe2/opt/shape_info.h index 06d42821f9fb..622b6404c0c8 100644 --- a/caffe2/opt/shape_info.h +++ b/caffe2/opt/shape_info.h @@ -5,13 +5,25 @@ namespace caffe2 { struct CAFFE2_API QShapeInfo { - QShapeInfo(float o = 0, float s = 1) : offset(o), scale(s) {} - float offset; - float scale; - // TODO zrphercule - // Add multi offset/scale support here + QShapeInfo(float o = 0, float s = 1, uint32_t a = 1) { + offset.clear(); + scale.clear(); + offset.push_back(o); + scale.push_back(s); + axis = a; + } + + uint32_t axis; + vector offset; + vector scale; }; +CAFFE2_API void LoadInt8FCDNNLowPPackedWeightBlobInfoOfBlob( + std::vector* scale, + std::vector* offset, + uint32_t* axis, + const Blob* b); + struct CAFFE2_API ShapeInfo { enum DimType : int8_t { UNKNOWN = 0, CONSTANT = 1, BATCH = 2, SEQ = 3 }; ShapeInfo(bool q = false) : is_quantized(q) {} diff --git a/caffe2/perfkernels/embedding_lookup_avx2.cc b/caffe2/perfkernels/embedding_lookup_avx2.cc index 271c07a7bb5d..87ac57356d2a 100644 --- a/caffe2/perfkernels/embedding_lookup_avx2.cc +++ b/caffe2/perfkernels/embedding_lookup_avx2.cc @@ -1291,6 +1291,7 @@ static bool EmbeddingLookup_int32_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1335,10 +1336,10 @@ static bool EmbeddingLookup_int32_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } @@ -1837,6 +1838,7 @@ static bool EmbeddingLookup_int64_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1881,10 +1883,10 @@ static bool EmbeddingLookup_int64_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } diff --git a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc index 12f790df22e3..230b3bc85687 100644 --- a/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc +++ 
b/caffe2/perfkernels/embedding_lookup_fused_8bit_rowwise_avx2.cc @@ -1280,6 +1280,7 @@ static bool Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1324,10 +1325,10 @@ static bool Fused8BitRowwiseEmbeddingLookup_int32_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } @@ -1821,6 +1822,7 @@ static bool Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( } } else { // generic code + alignas(64) at::Half vtmp1[8] = {0}; for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) { float* op = &out[rangeIndex * block_size]; int64_t j = 0; @@ -1865,10 +1867,10 @@ static bool Fused8BitRowwiseEmbeddingLookup_int64_t_half_float__avx2_fma( _mm_prefetch( reinterpret_cast(&ip_next_T0[j]), _MM_HINT_T0); } - alignas(64) at::Half vtmp1[8]; for (; j < block_size; j++) { vtmp1[0] = ip[j]; - __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1)); + __m256 vtmp2 = + _mm256_cvtph_ps(*(reinterpret_cast(vtmp1))); op[j] += wgt * ((float*)(&vtmp2))[0]; } } diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index 0af4120b064f..bfde7521446d 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -197,6 +197,8 @@ def compute(InType, use_weights, isa): return code code = [] + if InType == "at::Half": + code.append(" alignas(64) at::Half vtmp1[8] = {0};") code.append( " for (" + IndexType @@ -283,14 +285,15 @@ def compute(InType, use_weights, isa): code.extend(compute(InType, use_weights, isa)) code.append(" }") # leftover - if InType == "at::Half": - code.append(" alignas(64) at::Half vtmp1[8];") code.append(" for (; j < block_size; j++) {") if InType == "float": code.append(" op[j] += wgt * ip[j];") elif InType == "at::Half": code.append(" vtmp1[0] = ip[j];") - code.append(" __m256 vtmp2 = _mm256_cvtph_ps(*((__m128i*)vtmp1));") + code.append( + " __m256 vtmp2 =\n" + " _mm256_cvtph_ps(*(reinterpret_cast(vtmp1)));" + ) code.append(" op[j] += wgt * ((float*)(&vtmp2))[0];") elif InType == "uint8_t": code.append(" op[j] += wgt * ((float)ip[j]) + bio;") diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 615d871d5c79..300d161a6e65 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -4,6 +4,7 @@ namespace caffe2 { +class Workspace; namespace { void enforceIsTensor(Workspace* ws, const std::string& name) { diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h index eda1c9d03ca2..243729b044e9 100644 --- a/caffe2/predictor/predictor_config.h +++ b/caffe2/predictor/predictor_config.h @@ -1,7 +1,8 @@ #pragma once #include -#include "caffe2/core/net.h" + #include "caffe2/core/tensor.h" +#include "caffe2/core/workspace.h" #include "caffe2/proto/metanet.pb.h" #include "caffe2/proto/predictor_consts.pb.h" diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 2b079de3d44a..576c1034c561 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -126,6 +126,17 @@ message QTensorProto { repeated int32 data = 6 [packed = 
true]; optional string name = 7; optional TensorProto.DataType data_type = 8 [default = INT32]; + + // Multi-group quantization params + repeated double scales = 9; + repeated double biases = 10; + + // Multi-group quantization needed, indicates in which dimension + // we do the "group wise quantization" + optional int32 axis = 11; + + // It should be true if it is a multi-group quantization proto + optional bool is_multiparam = 12 [default = false]; } // TensorProtos stores multiple TensorProto objects in one single proto. This diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto index 6208e0276cee..a140bb82cb5c 100644 --- a/caffe2/proto/torch.proto +++ b/caffe2/proto/torch.proto @@ -69,6 +69,9 @@ message ModuleDef { optional bool optimize = 8; repeated AttributeDef attributes = 9; + + // Used for retrieving module state from the pickled IValues table + optional int64 get_state_attribute_id = 10; } // Represents all non-module code that the model depends on. @@ -79,7 +82,7 @@ message LibDef { } enum ProtoVersion { - PROTO_VERSION_NEWEST = 0x0000000000000003; + PROTO_VERSION_NEWEST = 0x0000000000000005; } message ModelDef { diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index f73745513c10..73ef06999a54 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -19,7 +19,6 @@ ) logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 38078773bed0..f142dde5cc38 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -106,6 +106,14 @@ def add_metric_field(self, name, value): (name, value) ) + # an empty white_set will skip everything + def filter_metrics_schema(self, white_set): + logger.info("Filter metric schema with white_set {}".format(white_set)) + field_names = self._metrics_schema.field_names() + for name in field_names: + if name not in white_set: + self._metrics_schema = self._metrics_schema - schema.Struct((name, schema.Scalar())) + def add_ad_hoc_plot_blob(self, blob, dtype=None): assert isinstance( blob, (six.string_types, core.BlobReference) diff --git a/caffe2/python/layers/batch_distill_lr_loss.py b/caffe2/python/layers/batch_distill_lr_loss.py deleted file mode 100644 index c4a367956922..000000000000 --- a/caffe2/python/layers/batch_distill_lr_loss.py +++ /dev/null @@ -1,191 +0,0 @@ -## @package batch_distill_lr_loss -# Module caffe2.python.layers.batch_distill_lr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -class BatchDistillLRLoss(ModelLayer): - - def __init__( - self, model, input_record, - name='batch_distill_lr_loss', teacher_weight=0.0, - filter_invalid_teacher_label=False, **kwargs): - - super(BatchDistillLRLoss, self).__init__(model, name, input_record, **kwargs) - - assert teacher_weight >= 0 and teacher_weight <= 1, ( - 'teacher_weight=%0.2f should be in [0, 1]' % teacher_weight - ) - - self._teacher_weight = teacher_weight - self._filter_invalid_teacher_label = filter_invalid_teacher_label - # hyper-parameter determines whether to filter out bad teacehr labels, - # i.e., teacher labels that are zero. 
- if self._filter_invalid_teacher_label: - self.threshold = model.add_global_constant( - str(model.net.NextScopedBlob('threshold')), - [0.0], # threshold for filtering teacher weight. - dtype=np.float - ) - self.neg_ONE = model.add_global_constant( - str(model.net.NextScopedBlob('neg_ONE')), - [-1.0], - dtype=np.float - ) - self.ONE = model._GetOne() - assert schema.is_schema_subset( - schema.Struct( - ('teacher_label', schema.Scalar()), - ('label', schema.Scalar()), - ('logit', schema.Scalar()), - ), - input_record - ) - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - - self.output_schema = schema.Scalar( - np.float32, - self.get_next_blob_reference('output') - ) - - def add_ops(self, net): - label = self.input_record.label() - if self.input_record.label.field_type() != np.float32: - label = net.Cast( - label, - net.NextScopedBlob('float_label'), - to=core.DataType.FLOAT, - ) - - # Assuming 1-D input - label = net.ExpandDims(label, net.NextScopedBlob('expanded_label'), - dims=[1]) - - teacher_label = self.input_record.teacher_label() - - if self.input_record.teacher_label.field_type() != np.float32: - teacher_label = net.Cast( - teacher_label, - net.NextScopedBlob('float_teacher_label'), - to=core.DataType.FLOAT, - ) - teacher_label = net.ExpandDims( - teacher_label, net.NextScopedBlob('expanded_teacher_label'), - dims=[1]) - - true_xent = net.SigmoidCrossEntropyWithLogits( - [self.input_record.logit(), label], - net.NextScopedBlob('cross_entropy') - ) - - teacher_xent = net.SigmoidCrossEntropyWithLogits( - [self.input_record.logit(), teacher_label], - net.NextScopedBlob('teacher_cross_entropy') - ) - if self._filter_invalid_teacher_label: - squeezed_teacher_label = net.Squeeze( - teacher_label, - net.NextScopedBlob('squeezed_teacher_label'), - dims=[1] - ) - # blob used to contain the original teacher weights - keep_weights = net.ConstantFill( - [squeezed_teacher_label], - net.NextScopedBlob('keep_weights'), - value=self._teacher_weight, - dtype=core.DataType.FLOAT - ) - #blob used to zero out the teacher weights - zero_weights = net.ConstantFill( - [squeezed_teacher_label], - net.NextScopedBlob('zero_weights'), - value=0.0, - dtype=core.DataType.FLOAT - ) - - #Indicating which teacher labels are bad, i.e., are zero. - judge = net.GT( - [squeezed_teacher_label, self.threshold], - net.NextScopedBlob('judge'), - broadcast=1 - ) - #zero out bad teacher weights corresponding to bad teacher labels. 
- screened_teacher_weights = net.Conditional( - [judge, keep_weights, zero_weights], - net.NextScopedBlob('screened_teacher_weights') - ) - neg_screened_teacher_weights = net.Mul( - [screened_teacher_weights, self.neg_ONE], - net.NextScopedBlob('neg_screened_teacher_weights'), - broadcast=1 - ) - one_minus_screened_teacher_weights = net.Add( - [neg_screened_teacher_weights, self.ONE], - net.NextScopedBlob('one_minus_screened_teacher_weights'), - broadcast=1 - ) - scaled_true_xent = net.Mul( - [true_xent, one_minus_screened_teacher_weights], - net.NextScopedBlob('scaled_cross_entropy'), - broadcast=1 - ) - scaled_teacher_xent = net.Mul( - [teacher_xent, screened_teacher_weights], - net.NextScopedBlob('scaled_teacher_cross_entropy'), - broadcast=1 - ) - else: - scaled_true_xent = net.Scale( - true_xent, - net.NextScopedBlob('scaled_cross_entropy'), - scale=float(1.0 - self._teacher_weight), - ) - scaled_teacher_xent = net.Scale( - teacher_xent, - net.NextScopedBlob('scaled_teacher_cross_entropy'), - scale=float(self._teacher_weight), - ) - if 'weight' in self.input_record.fields: - weight_blob = self.input_record.weight() - if self.input_record.weight.field_type().base != np.float32: - weight_blob = net.Cast( - weight_blob, - weight_blob + '_float32', - to=core.DataType.FLOAT - ) - weight_blob = net.StopGradient( - [weight_blob], - [net.NextScopedBlob('weight_stop_gradient')], - ) - scaled_true_xent = net.Mul( - [scaled_true_xent, weight_blob], - net.NextScopedBlob('weighted_xent_label'), - ) - scaled_teacher_xent = net.Mul( - [scaled_teacher_xent, weight_blob], - net.NextScopedBlob('weighted_xent_teacher'), - ) - - true_loss = net.AveragedLoss( - scaled_true_xent, - net.NextScopedBlob('true_loss') - ) - teacher_loss = net.AveragedLoss( - scaled_teacher_xent, - net.NextScopedBlob('teacher_loss') - ) - net.Add( - [true_loss, teacher_loss], - self.output_schema.field_blobs() - ) diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index 73c937358b38..b38909ed9e7a 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -58,6 +58,10 @@ class SparseLookup(ModelLayer): 'Float16UniformFill' ] + _fp16_compatible_reducers = [ + 'Sum', 'Mean', 'Sqrt', 'PositionWeighted', 'RecencyWeighted', + ] + def __init__(self, model, input_record, inner_shape, reducer, weight_init=None, weight_optim=None, name='sparse_lookup', regularizer=None, **kwargs): @@ -105,6 +109,14 @@ def __init__(self, model, input_record, inner_shape, reducer, # If fp16 is used, make sure fp16 init op is used if self.trainer_version == "fp16": + assert self.reducer in self._fp16_compatible_reducers, ( + "Fp16 training is enabled. The reducer specified is not supported. " + "Got {}. Supported reducers: {}. Right now, in general, sum, mean, " + "positional pooling are supported. Attention is not. 
Please check " + "if there is fp16 trained sparse features using advanced pooling.".format( + self.reducer, self._fp16_compatible_reducers) + ) + # if init op is UniformFill, we replace it directly if self.weight_init[0] == "UniformFill": self.weight_init = ("Float16UniformFill", self.weight_init[1]) diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index 031060301be5..eaee1e52e9c1 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -113,6 +113,31 @@ def testAddLoss(self): assert core.BlobReference('loss_blob_in_tuple_1')\ in self.model.loss.field_blobs() + def testFilterMetricSchema(self): + self.model.add_metric_field("a:b", schema.Scalar()) + self.model.add_metric_field("a:c", schema.Scalar()) + self.model.add_metric_field("d", schema.Scalar()) + + self.assertEqual( + self.model.metrics_schema, + schema.Struct( + ("a", schema.Struct( + ("b", schema.Scalar()), + ("c", schema.Scalar()), + )), + ("d", schema.Scalar()), + )) + + self.model.filter_metrics_schema({"a:b", "d"}) + self.assertEqual( + self.model.metrics_schema, + schema.Struct( + ("a", schema.Struct( + ("b", schema.Scalar()), + )), + ("d", schema.Scalar()), + )) + def testAddOutputSchema(self): # add the first field self.model.add_output_schema('struct', schema.Struct()) @@ -701,72 +726,6 @@ def testSamplingTrain(self): ] ) - def testDistillBatchLRLoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float64, (1,)))), - ('logit', schema.Scalar((np.float32, (2,)))), - ('teacher_label', schema.Scalar((np.float32(1,)))), - ('weight', schema.Scalar((np.float64, (1,)))) - )) - loss = self.model.BatchDistillLRLoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testDistillBatchLRLossWithTeacherWeightScreen(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float32, (2,)))), - ('logit', schema.Scalar((np.float32, (2, 1)))), - ('teacher_label', schema.Scalar((np.float32(2,)))), - ('weight', schema.Scalar((np.float64, (2,)))) - )) - label_items = np.array([1.0, 1.0], dtype=np.float32) - logit_items = np.array([[1.0], [1.0]], dtype=np.float32) - teacher_label_items = np.array([0.8, -1.0], dtype=np.float32) - weight_items = np.array([1.0, 1.0], dtype=np.float32) - schema.FeedRecord( - input_record, - [label_items, logit_items, teacher_label_items, weight_items] - ) - loss = self.model.BatchDistillLRLoss( - input_record, - teacher_weight=0.5, - filter_invalid_teacher_label=True - ) - self.run_train_net_forward_only() - tensor_loss = workspace.FetchBlob(loss.field_blobs()[0]) - - def cross_entropy(label, logit): - return logit - logit * label + np.log(1 + np.exp(-1.0 * logit)) - - def cal_cross_entropy( - label_items, logit_items, teacher_label_items, weight_items - ): - total_ce = 0 - for i in range(label_items.shape[0]): - true_xent = cross_entropy(label_items[i], logit_items[i, 0]) - if teacher_label_items[i] > 0: - teacher_xent = cross_entropy( - teacher_label_items[i], logit_items[i, 0] - ) - else: - teacher_xent = 0 - teacher_weight = 0.5 if teacher_label_items[i] > 0 else 0 - total_ce += (true_xent * (1 - teacher_weight) + - teacher_xent * teacher_weight) * weight_items[i] - return total_ce / label_items.shape[0] - - correct_ace = cal_cross_entropy( - label_items, - logit_items, - teacher_label_items, - weight_items - ) - self.assertAlmostEqual( - tensor_loss, - np.array(correct_ace), - delta=0.0000001, - msg="Wrong cross entropy {}".format(tensor_loss) - ) - def 
testBatchLRLoss(self): input_record = self.new_record(schema.Struct( ('label', schema.Scalar((np.float64, (1,)))), diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py index 8a1928e08ed9..6b4f1716ffcc 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ b/caffe2/python/modeling/compute_norm_for_blobs.py @@ -19,10 +19,11 @@ class ComputeNormForBlobs(NetModifier): blobs: list of blobs to compute norm for logging_frequency: frequency for printing norms to logs p: type of norm. Currently it supports p=1 or p=2 - compute_averaged_norm: norm or averaged_norm (averaged_norm = norm/size) + compute_averaged_norm: norm or averaged_norm (averaged_norm = norm/size + row_index: to plot the entire blob or simply one row at the row_index) """ - def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False): + def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False, row_index=None): self._blobs = blobs self._logging_frequency = logging_frequency self._p = p @@ -31,11 +32,17 @@ def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False): if compute_averaged_norm: self._field_name_suffix = '_averaged' + self._field_name_suffix + if row_index and row_index < 0: + raise Exception('{0} is not a valid row index, row_index should be >= 0'.format( + row_index)) + self.row_index = row_index + def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, modify_output_record=False): p = self._p compute_averaged_norm = self._compute_averaged_norm + row_index = self.row_index CPU = muji.OnCPU() # if given, blob_to_device is a map from blob to device_option @@ -51,12 +58,21 @@ def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, device = CPU with core.DeviceScope(device): - norm_name = net.NextScopedBlob(prefix=blob + self._field_name_suffix) + if row_index and row_index >= 0: + blob = net.Slice( + [blob], + net.NextScopedBlob(prefix=blob + '_row_{0}'.format(row_index)), + starts=[row_index, 0], + ends=[row_index + 1, -1] + ) + cast_blob = net.Cast( blob, net.NextScopedBlob(prefix=blob + '_float'), to=core.DataType.FLOAT ) + + norm_name = net.NextScopedBlob(prefix=blob + self._field_name_suffix) norm = net.LpNorm( cast_blob, norm_name, p=p, average=compute_averaged_norm ) diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py index f4e8d1ef5614..d6bfda1adf92 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ b/caffe2/python/modeling/compute_norm_for_blobs_test.py @@ -102,10 +102,10 @@ def test_compute_averaged_norm_for_blobs(self): workspace.RunNetOnce(model.net) fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_l2_averaged_norm = workspace.FetchBlob('fc1_w_averaged_l2_norm') + fc1_w_averaged_l2_norm = workspace.FetchBlob('fc1_w_averaged_l2_norm') - self.assertEqual(fc1_w_l2_averaged_norm.size, 1) - self.assertAlmostEqual(fc1_w_l2_averaged_norm[0], + self.assertEqual(fc1_w_averaged_l2_norm.size, 1) + self.assertAlmostEqual(fc1_w_averaged_l2_norm[0], np.linalg.norm(fc1_w)**2 / fc1_w.size, delta=1e-5) @@ -203,3 +203,30 @@ def test_compute_l1_averaged_norm_for_blobs(self): delta=1e-5) self.assertEqual(len(model.net.Proto().op), 8) + + def test_compute_norm_row_index_for_blobs(self): + model = model_helper.ModelHelper(name="test") + data = model.net.AddExternalInput("data") + fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) + + net_modifier = ComputeNormForBlobs( + 
blobs=['fc1_w'], + logging_frequency=10, + compute_averaged_norm=True, + row_index=1 + ) + + net_modifier(model.net) + + workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) + + workspace.RunNetOnce(model.param_init_net) + workspace.RunNetOnce(model.net) + + fc1_w = workspace.FetchBlob('fc1_w') + fc1_w_row_1_averaged_l2_norm = workspace.FetchBlob('fc1_w_row_1_averaged_l2_norm') + + self.assertEqual(fc1_w_row_1_averaged_l2_norm.size, 1) + self.assertAlmostEqual(fc1_w_row_1_averaged_l2_norm[0], + np.linalg.norm(fc1_w[1])**2 / fc1_w[1].size, + delta=1e-5) diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 94709dc6acdc..3c42cc9925cd 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -55,6 +55,8 @@ def ref_adadelta(param_in, **hu.gcs) def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta, grad = inputs + moment = np.abs(moment) + moment_delta = np.abs(moment_delta) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( @@ -85,6 +87,7 @@ def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc): def test_sparse_adadelta(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta, grad = inputs moment = np.abs(moment) + moment_delta = np.abs(moment_delta) lr = np.array([lr], dtype=np.float32) # Create an indexing array containing values that are lists of indices, diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 8209b1c04930..2a94528fde52 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -60,6 +60,7 @@ def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, **hu.gcs) def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs + mom2 = np.abs(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) @@ -93,6 +94,7 @@ def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): **hu.gcs_cpu_only) def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs + mom2 = np.abs(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py new file mode 100644 index 000000000000..f22ff6b0aed9 --- /dev/null +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import core, dyndep +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np + + +class TestBucketizeOp(hu.HypothesisTestCase): + @given( + x=hu.tensor( + min_dim=1, max_dim=2, dtype=np.float32, + elements=st.floats(min_value=-5, max_value=5)), + **hu.gcs) + def test_bucketize_op(self, x, gc, dc): + length = np.random.randint(low=1, high=5) + boundaries = np.random.randn(length) * 5 + boundaries.sort() + + def ref(x, boundaries): + bucket_idx = np.digitize(x, boundaries, right=True) + return [bucket_idx] + + op = core.CreateOperator('Bucketize', + ["X"], ["INDICES"], + boundaries=boundaries) + self.assertReferenceChecks(gc, op, [x, boundaries], ref) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git 
a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index 05ce3d0f94c8..4db3f1529d81 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -4,13 +4,11 @@ from __future__ import unicode_literals import numpy as np -import os import unittest -from hypothesis import given, settings +from hypothesis import given import hypothesis.strategies as st -from caffe2.proto import caffe2_pb2 from caffe2.python import core, utils import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial @@ -32,7 +30,8 @@ def boxes_area(boxes): def map_rois_to_fpn_levels( rois, k_min, k_max, - roi_canonical_scale, roi_canonical_level): + roi_canonical_scale, roi_canonical_level +): """Determine which FPN level each RoI in a set of RoIs should map to based on the heuristic in the FPN paper. """ @@ -130,25 +129,28 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): return outputs -class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): - @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) - def test_collect_and_dist( - self, - proposal_count, - rpn_min_level, rpn_num_levels, - roi_min_level, roi_num_levels, - rpn_post_nms_topN, - roi_canonical_scale, roi_canonical_level, - gc, dc): +def collect_rpn_ref(*inputs): + args = inputs[-1] + inputs = inputs[:-1] + rois = collect(inputs, **args) + return [rois] + +def distribute_fpn_ref(*inputs): + args = inputs[-1] + inputs = inputs[:-1] + rois = inputs[0] + num_roi_lvls = args['roi_num_levels'] + outputs = (num_roi_lvls + 2) * [None] + distribute(rois, None, outputs, **args) + # remove the first rois from output of distribute + outputs.pop(0) + return outputs + + +class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): + @staticmethod + def _create_input(proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale): np.random.seed(0) input_names = [] @@ -171,6 +173,30 @@ def test_collect_and_dist( input_names.append('rpn_roi_probs_fpn{}'.format(lvl + rpn_min_level)) inputs.append(rpn_roi_score) + return input_names, inputs + + @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) + def test_collect_and_dist( + self, + proposal_count, + rpn_min_level, rpn_num_levels, + roi_min_level, roi_num_levels, + rpn_post_nms_topN, + roi_canonical_scale, roi_canonical_level, + gc, dc + ): + input_names, inputs = self._create_input( + proposal_count, rpn_min_level, 
rpn_num_levels, roi_canonical_scale + ) + output_names = [ 'rois', ] @@ -193,7 +219,6 @@ def test_collect_and_dist( ], device_option=gc) args = { - 'proposal_count' : proposal_count, 'rpn_min_level' : rpn_min_level, 'rpn_num_levels' : rpn_num_levels, 'roi_min_level' : roi_min_level, @@ -205,10 +230,87 @@ def test_collect_and_dist( self.assertReferenceChecks( device_option=gc, op=op, - inputs=inputs+[args], + inputs=inputs + [args], reference=collect_and_distribute_fpn_rpn_ref, ) + @given( + proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) + def test_collect_and_dist_separately( + self, + proposal_count, + rpn_min_level, rpn_num_levels, + roi_min_level, roi_num_levels, + rpn_post_nms_topN, + roi_canonical_scale, roi_canonical_level, + gc, dc + ): + input_names, inputs = self._create_input( + proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale + ) + + collect_op = core.CreateOperator( + 'CollectRpnProposals', + input_names, + ['rois'], + arg=[ + utils.MakeArgument("rpn_max_level", rpn_min_level + rpn_num_levels - 1), + utils.MakeArgument("rpn_min_level", rpn_min_level), + utils.MakeArgument("rpn_post_nms_topN", rpn_post_nms_topN), + ], + device_option=gc) + collect_args = { + 'rpn_min_level' : rpn_min_level, + 'rpn_num_levels' : rpn_num_levels, + 'rpn_post_nms_topN' : rpn_post_nms_topN, + } + + self.assertReferenceChecks( + device_option=gc, + op=collect_op, + inputs=inputs + [collect_args], + reference=collect_rpn_ref, + ) + + rois = collect(inputs, **collect_args) + + output_names = [] + for lvl in range(roi_num_levels): + output_names.append('rois_fpn{}'.format(lvl + roi_min_level)) + output_names.append('rois_idx_restore') + + distribute_op = core.CreateOperator( + 'DistributeFpnProposals', + ['rois'], + output_names, + arg=[ + utils.MakeArgument("roi_canonical_scale", roi_canonical_scale), + utils.MakeArgument("roi_canonical_level", roi_canonical_level), + utils.MakeArgument("roi_max_level", roi_min_level + roi_num_levels - 1), + utils.MakeArgument("roi_min_level", roi_min_level), + ], + device_option=gc) + distribute_args = { + 'roi_min_level' : roi_min_level, + 'roi_num_levels' : roi_num_levels, + 'roi_canonical_scale' : roi_canonical_scale, + 'roi_canonical_level' : roi_canonical_level} + + self.assertReferenceChecks( + device_option=gc, + op=distribute_op, + inputs=[rois, distribute_args], + reference=distribute_fpn_ref, + ) + if __name__ == "__main__": unittest.main() diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py index 36333f411ba1..bcd277cf258b 100644 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_fill_op_test.py @@ -18,6 +18,7 @@ class TestGivenTensorFillOps(hu.HypothesisTestCase): (core.DataType.BOOL, np.bool_, "GivenTensorFill"), (core.DataType.INT32, np.int32, "GivenTensorFill"), (core.DataType.FLOAT, np.float32, "GivenTensorFill"), + (core.DataType.INT16, np.int16, "GivenTensorInt16Fill"), (core.DataType.INT32, np.int32, "GivenTensorIntFill"), (core.DataType.INT64, np.int64, 
"GivenTensorInt64Fill"), (core.DataType.BOOL, np.bool_, "GivenTensorBoolFill"), diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index 0772aee5c9b2..9a5159ee1770 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -19,6 +19,8 @@ class TestMathOps(serial.SerializedTestCase): exponent=st.floats(min_value=2.0, max_value=3.0), **hu.gcs) def test_elementwise_power(self, X, exponent, gc, dc): + # negative integer raised with non-integer exponent is domain error + X = np.abs(X) def powf(X): return (X ** exponent,) diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index a78d9434c811..04a99b2d8e94 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -539,6 +539,28 @@ def testHalfInt8Conversion(self): # TODO: find a tighter bound assert(np.allclose(x, x_recovered, atol=1e-2)) + def testLearningRateOp(self): + net = core.Net("lr_test") + iteration = net.ConstantFill( + [], + "iteration", + shape=[1], + value=0, + dtype=core.DataType.INT64, + ) + lr = net.LearningRate( + [iteration], + net.NextScopedBlob("weight_decay"), + base_lr=0.5, + policy="constantWarmup", + multiplier=0.0, + num_iter=0, + ) + (shapes, types) = workspace.InferShapesAndTypes( + [net], + ) + self.assertEqual(shapes['weight_decay'], [1]) + def testShapeOp(self): model = model_helper.ModelHelper(name="shape_op_test") model.Shape('x', 'y') diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 4e811a193534..a3a0d6fbafa9 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -122,6 +122,7 @@ def bbox_transform_ref(): -90, 90, clip_angle_thresh, + legacy_plus_one=True, ) torch.testing.assert_allclose(box_out, a) @@ -161,6 +162,7 @@ def test_box_with_nms_limits( -90, 90, clip_angle_thresh, + legacy_plus_one=True, ) ] class_prob = np.random.randn(sum(roi_counts), num_classes).astype(np.float32) @@ -206,6 +208,7 @@ def box_with_nms_limit_ref(): cls_agnostic_bbox_reg=False, input_boxes_include_bg_cls=True, output_classes_include_bg_cls=True, + legacy_plus_one=True, ) for o, o_ref in zip(outputs, output_refs): @@ -258,6 +261,7 @@ def generate_proposals_ref(): -90, 90, 1.0, + legacy_plus_one=True, ) torch.testing.assert_allclose(rois, a) torch.testing.assert_allclose(rois_probs, b) @@ -392,6 +396,7 @@ def generate_proposals_ref(): -90, 90, 1.0, + legacy_plus_one=True, ) torch.testing.assert_allclose(rois, a.cpu()) torch.testing.assert_allclose(rois_probs, b.cpu()) @@ -451,6 +456,51 @@ def test_roi_align_cpu(self): def test_roi_align_cuda(self): self._test_roi_align(device="cuda") + @given(roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10)) + def test_collect_and_distribute_fpn_rpn_proposals_op(self, roi_counts): + batch_size = len(roi_counts) + im_dims = np.random.randint(100, 600, batch_size) + rpn_rois_and_scores = [] + for i in range(5): + rpn_rois_and_scores.append(torch.Tensor(generate_rois(roi_counts, im_dims))) + for i in range(5): + rpn_rois_and_scores.append(torch.rand(sum(roi_counts))) + + rois = torch.ops._caffe2.CollectRpnProposals( + rpn_rois_and_scores, + rpn_max_level=6, + rpn_min_level=2, + rpn_post_nms_topN=sum(roi_counts), + ) + fpn_outputs = torch.ops._caffe2.DistributeFpnProposals( + rois, + 
roi_canonical_scale=224, + roi_canonical_level=4, + roi_max_level=5, + roi_min_level=2, + legacy_plus_one=True, + ) + + all_outputs = torch.ops._caffe2.CollectAndDistributeFpnRpnProposals( + rpn_rois_and_scores, + roi_canonical_scale=224, + roi_canonical_level=4, + roi_max_level=5, + roi_min_level=2, + rpn_max_level=6, + rpn_min_level=2, + rpn_post_nms_topN=sum(roi_counts), + legacy_plus_one=True, + ) + + rois_fpn_list = fpn_outputs[:-1] + rois_idx_restore_int32 = fpn_outputs[-1] + + # [rois] + fpn_outputs should be equal to all_outputs + torch.testing.assert_allclose(rois, all_outputs[0]) + for x, y in zip(fpn_outputs, all_outputs[1:]): + torch.testing.assert_allclose(x, y) + @given(X=hu.tensor(), fast_gelu=st.booleans()) def _test_gelu_op(self, X, fast_gelu, device): @@ -472,5 +522,58 @@ def test_gelu_op_cuda(self): self._test_gelu_op(device="cuda") + @given(inputs=hu.lengths_tensor( + dtype=np.float32, + min_value=1, + max_value=5, + allow_empty=True, + )) + def _test_lengths_op(self, inputs, ref_op_name, torch_op, device): + data, lengths = inputs + + def _lengths_ref(X, Y): + ref_op = core.CreateOperator(ref_op_name, ["X", "Y"], "out") + workspace.FeedBlob("X", X) + workspace.FeedBlob("Y", Y) + workspace.RunOperatorOnce(ref_op) + return workspace.FetchBlob("out") + + expected_output = _lengths_ref(data, lengths) + actual_output = torch_op( + torch.tensor(data), torch.tensor(lengths, dtype=torch.int32)) + + torch.testing.assert_allclose(expected_output, actual_output.cpu()) + + def _test_lengths_sum_op(self, device): + self._test_lengths_op("LengthsSum", torch.ops._caffe2.LengthsSum, device) + + def test_lengths_sum_op(self): + self._test_lengths_sum_op(device="cpu") + + @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") + def test_lengths_sum_op_cuda(self): + self._test_lengths_sum_op(device="cuda") + + def _test_lengths_mean_op(self, device): + self._test_lengths_op("LengthsMean", torch.ops._caffe2.LengthsMean, device) + + def test_lengths_mean_op(self): + self._test_lengths_mean_op(device="cpu") + + @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") + def test_lengths_mean_op_cuda(self): + self._test_lengths_mean_op(device="cuda") + + def _test_lengths_max_op(self, device): + self._test_lengths_op("LengthsMax", torch.ops._caffe2.LengthsMax, device) + + def test_lengths_max_op(self): + self._test_lengths_max_op(device="cpu") + + @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") + def test_lengths_max_op_cuda(self): + self._test_lengths_max_op(device="cuda") + + if __name__ == '__main__': unittest.main() diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 2b2340e68722..66434ff4ef15 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -422,7 +422,7 @@ void addObjectMethods(py::module& m) { .def("_wrap_tensor_impl", [](Blob* blob, void* ptr) { auto p = c10::intrusive_ptr:: unsafe_reclaim_from_nonowning(static_cast(ptr)); - AT_CHECK(p.defined(), "Can't wrap undefined tensor"); + TORCH_CHECK(p.defined(), "Can't wrap undefined tensor"); auto at_tensor = at::Tensor::wrap_tensor_impl(std::move(p)); BlobSetTensor(blob, Tensor(std::move(at_tensor))); }); @@ -1263,6 +1263,14 @@ void addGlobalMethods(py::module& m) { net->TEST_Benchmark(warmup_runs, main_runs, run_individual); return stat; }); + m.def("benchmark_net_once", [](const std::string& name) { + CAFFE_ENFORCE(gWorkspace); + auto* net = gWorkspace->GetNet(name); + CAFFE_ENFORCE(net, "Didn't find net: ", name); + 
py::gil_scoped_release g; + float stat = net->TEST_Benchmark_One_Run(); + return stat; + }); m.def("delete_net", [](const std::string& name) { CAFFE_ENFORCE(gWorkspace); diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 9c7ef354d1de..72187ccda06c 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -502,6 +502,12 @@ def __add__(self, other): children[name] = right_field continue left_field = children[name] + if not (isinstance(left_field, Struct) and isinstance(right_field, Struct)): + raise TypeError( + "Type of left_field, " + str(type(left_field)) + + ", and type of right_field, " + + str(type(right_field)) + + ", must both the Struct to allow merging of the field, " + name) children[name] = left_field + right_field return Struct(*(viewitems(children))) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py index 73e10d0725b8..375068ef537e 100644 --- a/caffe2/python/sparse_to_dense_mask_test.py +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -35,7 +35,8 @@ def test_sparse_to_dense_mask_invalid_inputs(self): 'SparseToDenseMask', ['indices', 'values', 'default', 'lengths'], ['output'], - mask=[999999999, 2]) + mask=[999999999, 2], + max_skipped_indices=3) workspace.FeedBlob( 'indices', np.array([2000000000000, 999999999, 2, 3, 4, 5], dtype=np.int32)) @@ -48,11 +49,13 @@ def test_sparse_to_dense_mask_invalid_inputs(self): workspace.RunOperatorOnce(op) except RuntimeError: self.fail("Exception raised with only one negative index") + + # 3 invalid inputs should throw. workspace.FeedBlob( 'indices', - np.array([2000000000000, 999999999, -2, -3, -4, -5], dtype=np.int32)) + np.array([-1, 1, 2, 3, 4, 5], dtype=np.int32)) with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) + workspace.RunOperatorMultiple(op, 3) def test_sparse_to_dense_mask_subtensor(self): op = core.CreateOperator( diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index c28865097099..cf03910b51e7 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -36,6 +36,7 @@ RootFolder = C.root_folder Workspaces = C.workspaces BenchmarkNet = C.benchmark_net +BenchmarkNetOnce = C.benchmark_net_once GetStats = C.get_stats operator_tracebacks = defaultdict(dict) diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc index 7b28da9c9173..9da2013d935e 100644 --- a/caffe2/quantization/server/fbgemm_pack_op.cc +++ b/caffe2/quantization/server/fbgemm_pack_op.cc @@ -1,5 +1,6 @@ #include "fbgemm_pack_op.h" +#include "caffe2/core/tensor.h" #include "caffe2/core/tensor_int8.h" #include "caffe2_dnnlowp_utils.h" @@ -570,10 +571,115 @@ bool ConvDNNLowPPackWeightOp::RunOnDevice() { return true; } +bool Int8DNNLowpPackedWeightBlobShapeFunctions::IsSameMetaType( + TypeIdentifier id) { + return id == TypeMeta::Id() || + id == TypeMeta::Id(); +} + +TypeIdentifier Int8DNNLowpPackedWeightBlobShapeFunctions::GetTypeMetaId( + const string& name) { + if (name == "FC") { + return TypeMeta::Id(); + } else if (name == "Conv") { + return TypeMeta::Id(); + } else { + CAFFE_THROW("Class type is not supported: ", name); + return TypeMeta::Id(); + } +} + +TypeMeta Int8DNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType( + const void* c) { + // There might be some problem if type if FC. + // We should use a different function. 
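// Note: the cast below relies on an assumption, not a guarantee: it is valid only
// if Int8FCDNNLowPPackedWeightBlob and Int8ConvDNNLowPPackedWeightBlob expose
// `original_tensor` compatibly, so that reading the dtype through the Conv type
// also happens to work for FC blobs. If that layout assumption ever changes,
// dispatch on the blob's actual TypeMeta instead of casting unconditionally.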
+ const Int8ConvDNNLowPPackedWeightBlob* int8_tensor = + reinterpret_cast(c); + return (int8_tensor->original_tensor).dtype(); +} + +vector +Int8DNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorInfo( + const void* c, + size_t* capacity, + DeviceOption* device) { + const Int8ConvDNNLowPPackedWeightBlob* int8_tensor = + reinterpret_cast(c); + return GetTensorInfo(&(int8_tensor->original_tensor), capacity, device); +} + +void Int8DNNLowpPackedWeightBlobShapeFunctions::LoadInfoOfBlob( + const Blob* blob, + std::vector* scale, + std::vector* offset, + uint32_t* axis) { + scale->clear(); + offset->clear(); + const Int8ConvDNNLowPPackedWeightBlob* int8_tensor = + reinterpret_cast(blob->GetRaw()); + const auto& qparams = int8_tensor->qparams; + for (const auto& qparam : qparams) { + scale->emplace_back(qparam.scale); + offset->emplace_back(static_cast(qparam.zero_point)); + } + *axis = 1; +} + +void Int8DNNLowpPackedWeightBlobShapeFunctions::SetupExternalTensorDescriptor( + const Blob* blob, + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets, + ExternalTensorDescriptor* desc) { + const auto& dnntensor = blob->template Get(); + const Tensor& cpu_tensor = dnntensor.original_tensor; + + if (cpu_tensor.template IsType()) { + desc->dataType = kONNXIFI_DATATYPE_UINT8; + desc->buffer = reinterpret_cast(cpu_tensor.data()); + } else if (cpu_tensor.template IsType()) { + desc->dataType = kONNXIFI_DATATYPE_INT32; + desc->buffer = reinterpret_cast(cpu_tensor.data()); + } else if (cpu_tensor.template IsType()) { + desc->dataType = kONNXIFI_DATATYPE_INT8; + desc->buffer = reinterpret_cast(cpu_tensor.data()); + } else { + CAFFE_THROW( + "Unsupported Int8ConvDNNLowPPackedWeightBlob type in ONNXIFI: ", + cpu_tensor.dtype().name()); + } + + desc->quantizationParams = dnntensor.qparams.size(); + desc->quantizationAxis = 1; + std::vector scales, offsets; + for (const auto v : dnntensor.qparams) { + scales.emplace_back(v.scale); + offsets.emplace_back(v.zero_point); + } + all_scales->push_back(scales); + all_offsets->push_back(offsets); + desc->scales = all_scales->back().data(); + desc->biases = reinterpret_cast(all_offsets->back().data()); + + // Set up dim and shape + const auto shape = cpu_tensor.sizes(); + desc->dimensions = shape.size(); + shapes->emplace_back(shape.cbegin(), shape.cend()); + desc->shape = shapes->back().data(); +} + // Explicitly register TypeMeta CAFFE_KNOWN_TYPE(Int8FCDNNLowPPackedWeightBlob); CAFFE_KNOWN_TYPE(Int8ConvDNNLowPPackedWeightBlob); +// Register DNNLOWP Type in caffe2 core +REGISTER_EXTERNAL_TENSOR_FUNCTIONS( + (TypeMeta::Id()), + Int8DNNLowpPackedWeightBlobShapeFunctions); +REGISTER_EXTERNAL_TENSOR_FUNCTIONS( + (TypeMeta::Id()), + Int8DNNLowpPackedWeightBlobShapeFunctions); + REGISTER_CPU_OPERATOR_WITH_ENGINE( Int8FCPackWeight, DNNLOWP, diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h index db7ff52d9b9f..31e7e6dd6040 100644 --- a/caffe2/quantization/server/fbgemm_pack_op.h +++ b/caffe2/quantization/server/fbgemm_pack_op.h @@ -101,5 +101,38 @@ fbgemm::CompressedSparseColumn* ExtractOutlierMatrix( int M, int nbits_in_non_outlier, vector& W_quantized); +/* + * Set up used onnxifi data type constexpr + * Should always be synced with onnxifi.h + */ +constexpr uint64_t kONNXIFI_DATATYPE_UINT8 = 2; +constexpr uint64_t kONNXIFI_DATATYPE_INT32 = 6; +constexpr uint64_t kONNXIFI_DATATYPE_INT8 = 3; + +class Int8DNNLowpPackedWeightBlobShapeFunctions + : public ExternalTensorFunctionsBase { + public: + 
explicit Int8DNNLowpPackedWeightBlobShapeFunctions() + : ExternalTensorFunctionsBase() {} + ~Int8DNNLowpPackedWeightBlobShapeFunctions() override {} + bool IsSameMetaType(TypeIdentifier id) override; + void SetupExternalTensorDescriptor( + const Blob* blob, + std::vector>* shapes, + std::vector>* all_scales, + std::vector>* all_offsets, + ExternalTensorDescriptor* desc) override; + void LoadInfoOfBlob( + const Blob* blob, + std::vector* scale, + std::vector* offset, + uint32_t* axis) override; + TypeIdentifier GetTypeMetaId(const string& name) override; + TypeMeta GetExternalTensorType(const void* c) override; + vector GetExternalTensorInfo( + const void* c, + size_t* capacity, + DeviceOption* device) override; +}; } // namespace caffe2 diff --git a/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc b/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc index 414bfe2d3d21..c66bb976280c 100644 --- a/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc +++ b/caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc @@ -2,21 +2,39 @@ namespace caffe2 { +namespace { + +// NOTE: clang-format wants to use a different formatting but the +// current formatting should be easier to read. +alignas(64) const int ld_st_masks[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 0, }, + { -1, 0, 0, 0, 0, 0, 0, 0, }, + { -1, -1, 0, 0, 0, 0, 0, 0, }, + { -1, -1, -1, 0, 0, 0, 0, 0, }, + { -1, -1, -1, -1, 0, 0, 0, 0, }, + { -1, -1, -1, -1, -1, 0, 0, 0, }, + { -1, -1, -1, -1, -1, -1, 0, 0, }, + { -1, -1, -1, -1, -1, -1, -1, 0, }, +}; + +} // anonymous namespace + // convert to float16 reducing mantissa, preserving exponent void fp32_to_bfp16(const float* source, size_t size, float* dest) { // Results on a 1 sign, 8 exponent, 7 mantissa constexpr int mask = 0xFFFF0000; __m256 wmask = _mm256_broadcast_ss(reinterpret_cast(&mask)); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m256 data = _mm256_loadu_ps(&source[i]); _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i])); - _mm256_store_ps(tmp, data); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m256 data = _mm256_maskload_ps(&source[i], ld_st_mask); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_and_ps(wmask, data)); } } @@ -26,15 +44,16 @@ void fp32_to_bfp24(const float* source, size_t size, float* dest) { constexpr int mask = 0xFFFFFF00; __m256 wmask = _mm256_broadcast_ss(reinterpret_cast(&mask)); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m256 data = _mm256_loadu_ps(&source[i]); _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i])); - _mm256_store_ps(tmp, data); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m256 data = _mm256_maskload_ps(&source[i], ld_st_mask); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_and_ps(wmask, data)); } } @@ -44,15 +63,16 @@ void fp32_to_bfp14(const float* source, size_t size, float* dest) { constexpr int mask = 0xFFFC0000; __m256 wmask = _mm256_broadcast_ss((float*)(&mask)); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i 
= 0; + for (; i < (size / 8) * 8; i += 8) { __m256 data = _mm256_loadu_ps(&source[i]); _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i])); - _mm256_store_ps(tmp, data); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m256 data = _mm256_maskload_ps(&source[i], ld_st_mask); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_and_ps(wmask, data)); } } @@ -65,15 +85,17 @@ void fp32_to_bfp16_scalar(const float* source, size_t size, float* dest) { // convert to IEEE float16 void fp32_to_fp16(const float* source, size_t size, float* dest) { - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m128i vin_fp16 = _mm256_cvtps_ph(_mm256_loadu_ps(&source[i]), 0); _mm256_storeu_ps(&dest[i], _mm256_cvtph_ps(vin_fp16)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; - __m128i vin_fp16 = _mm256_cvtps_ph(_mm256_set1_ps(source[i]), 0); - _mm256_store_ps(tmp, _mm256_cvtph_ps(vin_fp16)); - dest[i] = tmp[0]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); + __m128i vin_fp16 = + _mm256_cvtps_ph(_mm256_maskload_ps(&source[i], ld_st_mask), 0); + _mm256_maskstore_ps(&dest[i], ld_st_mask, _mm256_cvtph_ps(vin_fp16)); } } @@ -85,20 +107,25 @@ void fp32_to_bfp16_round(const float* source, size_t size, float* dest) { __m256i woffset = _mm256_set1_epi32(offset); __m256i wmask = _mm256_set1_epi32(mask); - for (auto i = 0; i < (size / 8) * 8; i += 8) { + size_t i = 0; + for (; i < (size / 8) * 8; i += 8) { __m256i v32int = _mm256_add_epi32( _mm256_loadu_si256(reinterpret_cast(&source[i])), woffset); _mm256_storeu_si256( reinterpret_cast<__m256i*>(&dest[i]), _mm256_and_si256(wmask, v32int)); } - for (auto i = (size / 8) * 8; i < size; i++) { - alignas(64) float tmp[8]; + if (i < size) { + __m256i ld_st_mask = _mm256_load_si256( + reinterpret_cast(ld_st_masks[size - i])); __m256i v32int = _mm256_add_epi32( - _mm256_set1_epi32(*reinterpret_cast(&source[i])), woffset); - _mm256_store_si256( - reinterpret_cast<__m256i*>(tmp), _mm256_and_si256(wmask, v32int)); - dest[i] = tmp[0]; + _mm256_maskload_epi32( + reinterpret_cast(&source[i]), ld_st_mask), + woffset); + _mm256_maskstore_epi32( + reinterpret_cast(&dest[i]), + ld_st_mask, + _mm256_and_si256(wmask, v32int)); } } diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 9d1c426e67b6..fb119d625ff8 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -165,6 +165,18 @@ static std::string getPadding(size_t cursor, const std::string& filename, size_t return buf; } +bool PyTorchStreamReader::hasFile(const std::string& name) { + std::stringstream ss; + ss << archive_name_ << "/" << name; + mz_zip_reader_locate_file(ar_.get(), ss.str().c_str(), nullptr, 0); + bool result = ar_->m_last_error != MZ_ZIP_FILE_NOT_FOUND; + if (!result) { + ar_->m_last_error = MZ_ZIP_NO_ERROR; + } + valid("attempting to locate file"); + return result; +} + size_t PyTorchStreamReader::getFileID(const std::string& name) { std::stringstream ss; ss << archive_name_ << "/" << name; diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index 28f71492780c..5ca9dcde7fa6 100644 --- a/caffe2/serialize/inline_container.h +++ 
b/caffe2/serialize/inline_container.h @@ -106,8 +106,8 @@ class CAFFE2_API PyTorchStreamReader final { // return dataptr, size std::tuple getRecord(const std::string& name); - size_t getRecordOffset(const std::string& name); + bool hasFile(const std::string& name); ~PyTorchStreamReader(); diff --git a/caffe2/serialize/inline_container_test.cc b/caffe2/serialize/inline_container_test.cc index 6b4c969665ce..70105f5c3bc6 100644 --- a/caffe2/serialize/inline_container_test.cc +++ b/caffe2/serialize/inline_container_test.cc @@ -39,6 +39,9 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) { // read records through readers PyTorchStreamReader reader(&iss); + ASSERT_TRUE(reader.hasFile("key1")); + ASSERT_TRUE(reader.hasFile("key2")); + ASSERT_FALSE(reader.hasFile("key2000")); at::DataPtr data_ptr; int64_t size; std::tie(data_ptr, size) = reader.getRecord("key1"); @@ -48,7 +51,6 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) { ASSERT_EQ(memcmp(the_file.c_str() + off1, data1.data(), data1.size()), 0); ASSERT_EQ(off1 % kFieldAlignment, 0); - std::tie(data_ptr, size) = reader.getRecord("key2"); size_t off2 = reader.getRecordOffset("key2"); ASSERT_EQ(off2 % kFieldAlignment, 0); diff --git a/caffe2/sgd/iter_op.cc b/caffe2/sgd/iter_op.cc index a2fc9e56abf3..6b5d20a189c8 100644 --- a/caffe2/sgd/iter_op.cc +++ b/caffe2/sgd/iter_op.cc @@ -50,6 +50,7 @@ OPERATOR_SCHEMA(AtomicIter) .NumInputs(2) .NumOutputs(1) .EnforceInplace({{1, 0}}) + .IdenticalTypeAndShapeOfInput(1) .SetDoc(R"DOC( Similar to Iter, but takes a mutex as the first input to make sure that updates are carried out atomically. This can be used in e.g. Hogwild sgd @@ -60,4 +61,4 @@ algorithms. NO_GRADIENT(Iter); NO_GRADIENT(AtomicIter); -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc index d6c4260bd79c..c4857884a643 100644 --- a/caffe2/sgd/learning_rate_op.cc +++ b/caffe2/sgd/learning_rate_op.cc @@ -6,6 +6,12 @@ REGISTER_CPU_OPERATOR(LearningRate, LearningRateOp); OPERATOR_SCHEMA(LearningRate) .NumInputs(1) .NumOutputs(1) + .TensorInferenceFunction([](const OperatorDef&, + const vector& in) { + vector out(1); + out[0] = in[0]; + return out; + }) .SetDoc(R"DOC( Learning rate is a decreasing function of time. With low learning rates the improvements will be linear. 
With high learning rates they will start to look diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 836e0b75026c..44966972d130 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -2687,6 +2687,22 @@ CAFFE2_CUDA_EXPORT void CopyVector( } } +template <> +CAFFE2_CUDA_EXPORT void CopyVector( + const int N, + const int* src, + int* dst, + CUDAContext* context) { + if (src != dst && N > 0) { + cudaMemcpyAsync( + dst, + src, + sizeof(int) * N, + cudaMemcpyDeviceToDevice, + context->cuda_stream()); + } +} + namespace { template diff --git a/caffe2/utils/signal_handler.cc b/caffe2/utils/signal_handler.cc index 5620eb801ca5..8b9db0ae0fcd 100644 --- a/caffe2/utils/signal_handler.cc +++ b/caffe2/utils/signal_handler.cc @@ -123,15 +123,13 @@ struct { const char* name; int signum; struct sigaction previous; -} kSignalHandlers[] = { - { "SIGABRT", SIGABRT, {} }, - { "SIGINT", SIGINT, {} }, - { "SIGILL", SIGILL, {} }, - { "SIGFPE", SIGFPE, {} }, - { "SIGBUS", SIGBUS, {} }, - { "SIGSEGV", SIGSEGV, {} }, - { nullptr, 0, {} } -}; +} kSignalHandlers[] = {{"SIGABRT", SIGABRT, {}}, + {"SIGINT", SIGINT, {}}, + {"SIGILL", SIGILL, {}}, + {"SIGFPE", SIGFPE, {}}, + {"SIGBUS", SIGBUS, {}}, + {"SIGSEGV", SIGSEGV, {}}, + {nullptr, 0, {}}}; struct sigaction* getPreviousSigaction(int signum) { for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) { @@ -433,7 +431,7 @@ REGISTER_CAFFE2_INIT_FUNCTION( "Inits signal handlers for fatal signals so we can see what if" " caffe2_print_stacktraces is set."); -} // namepsace internal +} // namespace internal #endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS) } // namespace caffe2 @@ -444,7 +442,12 @@ REGISTER_CAFFE2_INIT_FUNCTION( namespace caffe2 { SignalHandler::SignalHandler( SignalHandler::Action SIGINT_action, - SignalHandler::Action SIGHUP_action) {} + SignalHandler::Action SIGHUP_action) { + SIGINT_action_ = SIGINT_action; + SIGHUP_action_ = SIGHUP_action; + my_sigint_count_ = 0; + my_sighup_count_ = 0; +} SignalHandler::~SignalHandler() {} bool SignalHandler::GotSIGINT() { return false; diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h index c7e0dab5f75e..c5441e600390 100644 --- a/caffe2/video/video_input_op.h +++ b/caffe2/video/video_input_op.h @@ -484,8 +484,10 @@ VideoInputOp::VideoInputOp( label_shape[1] = num_of_class_; ReinitializeTensor(&prefetched_label_, label_shape, at::dtype().device(CPU)); } else { - prefetched_label_.Resize( - vector(1, batch_size_ * clip_per_video_ * multi_crop_count_)); + ReinitializeTensor( + &prefetched_label_, + vector(1, batch_size_ * clip_per_video_ * multi_crop_count_), + at::dtype().device(CPU)); } ReinitializeTensor(&prefetched_video_id_, vector(1, batch_size_ * clip_per_video_ * multi_crop_count_), at::dtype().device(CPU)); diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index fc9f1a74fc5f..8a2c3b1a2fc9 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -101,11 +101,21 @@ if (INTERN_BUILD_ATEN_OPS) IF(CXX_AVX2_FOUND) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION") + + # Some versions of GCC pessimistically split unaligned load and store + # instructions when using the default tuning. This is a bad choice on + # new Intel and AMD processors so we disable it when compiling with AVX2. 
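  # As an illustration (hypothetical example, not code from this repository): with
  # GCC's default generic tuning, an unaligned 256-bit access such as
  #   __m256 v = _mm256_loadu_ps(p);
  #   _mm256_storeu_ps(q, v);
  # may be compiled as two 128-bit halves (vmovups xmm plus vinsertf128/vextractf128)
  # instead of single 256-bit vmovups instructions; the -mno-avx256-split-unaligned-*
  # flags checked below keep the full-width form on CPUs where it is faster.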
+ # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top + check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT) + IF(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + SET(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + ENDIF(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + LIST(APPEND CPU_CAPABILITY_NAMES "AVX2") IF(MSVC) LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2") ELSE(MSVC) - LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}") ENDIF(MSVC) ENDIF(CXX_AVX2_FOUND) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index f0fb061b427f..2ddc2abb6580 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1101,15 +1101,6 @@ if (NOT INTERN_BUILD_MOBILE) ENDIF () ENDIF() - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9") - if (CUDA_VERSION VERSION_LESS "8.0") - MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__") - endif() - endif() - endif() - LIST(APPEND CUDA_NVCC_FLAGS -Wno-deprecated-gpu-targets) LIST(APPEND CUDA_NVCC_FLAGS --expt-extended-lambda) diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake index 5aace851397c..8ec0b44310fb 100644 --- a/cmake/Modules/FindOpenMP.cmake +++ b/cmake/Modules/FindOpenMP.cmake @@ -101,7 +101,7 @@ function(_OPENMP_FLAG_CANDIDATES LANG) set(OMP_FLAG_Intel "-qopenmp") endif() set(OMP_FLAG_MIPSpro "-mp") - set(OMP_FLAG_MSVC "-openmp") + set(OMP_FLAG_MSVC "-openmp:experimental" "-openmp") set(OMP_FLAG_PathScale "-openmp") set(OMP_FLAG_NAG "-openmp") set(OMP_FLAG_Absoft "-openmp") diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index f0ca7a5d18f7..ce215a736244 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -28,6 +28,9 @@ endif() message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) +if(CUDA_VERSION VERSION_LESS 9.0) + message(FATAL_ERROR "PyTorch requires CUDA 9.0 and above.") +endif() if(CUDA_FOUND) # Sometimes, we may mismatch nvcc with the CUDA headers we are @@ -299,17 +302,6 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif() endif() -if (${CUDA_VERSION} LESS 8.0) # CUDA 7.x - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") -elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. 
- list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") -endif() - # Add onnx namepsace definition to nvcc if (ONNX_NAMESPACE) list(APPEND CUDA_NVCC_FLAGS "-DONNX_NAMESPACE=${ONNX_NAMESPACE}") @@ -333,16 +325,6 @@ if ((CUDA_VERSION VERSION_EQUAL 9.0) OR "variable to use another version (for example): \n" " export CUDAHOSTCXX='/usr/bin/gcc-5'\n") endif() -elseif (CUDA_VERSION VERSION_EQUAL 8.0) - # CUDA 8.0 requires GCC version <= 5 - if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND - NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 6.0 AND - CUDA_HOST_COMPILER STREQUAL CMAKE_C_COMPILER) - message(FATAL_ERROR - "CUDA 8.0 is not compatible with GCC version >= 6. " - "Use the following option to use another version (for example): \n" - " -DCUDA_HOST_COMPILER=/usr/bin/gcc-5\n") - endif() endif() # setting nvcc arch flags diff --git a/docker/caffe2/jenkins/common/install_python.sh b/docker/caffe2/jenkins/common/install_python.sh index da79ab80bcf4..b22702a3a278 100755 --- a/docker/caffe2/jenkins/common/install_python.sh +++ b/docker/caffe2/jenkins/common/install_python.sh @@ -165,5 +165,7 @@ pip install --no-cache-dir \ mock \ typing \ typing-extensions \ - pyyaml + pyyaml \ + librosa>=0.6.2 \ + psutil diff --git a/docs/cpp/source/check-doxygen.sh b/docs/cpp/source/check-doxygen.sh index 9959a3fc2ba6..4311227cb91d 100755 --- a/docs/cpp/source/check-doxygen.sh +++ b/docs/cpp/source/check-doxygen.sh @@ -34,8 +34,10 @@ popd doxygen 2> original-doxygen-log.txt cp original-doxygen-log.txt doxygen-log.txt -echo "Original output" -cat original-doxygen-log.txt +# Uncomment this if you need it for debugging; we're not printing this +# by default because it is confusing. +# echo "Original output" +# cat original-doxygen-log.txt # Filter out some warnings. ignore_warning "warning: no uniquely matching class member found for" @@ -44,9 +46,12 @@ ignore_warning "warning: explicit link request to 'Item' could not be resolved" # Count the number of remaining warnings. warnings="$(grep 'warning:' doxygen-log.txt | wc -l)" +echo "Treating all remaining warnings as errors" + if [[ "$warnings" -ne "0" ]]; then - echo "Filtered output" + echo "Failing Doxygen test because the following warnings were treated fatally:" cat doxygen-log.txt + echo "Please fix these warnings. To run this test locally, use docs/cpp/source/check-doxygen.sh" rm -f doxygen-log.txt original-doxygen-log.txt exit 1 fi diff --git a/docs/cpp/source/installing.rst b/docs/cpp/source/installing.rst index 4e486fe00cdb..9f196ddda59a 100644 --- a/docs/cpp/source/installing.rst +++ b/docs/cpp/source/installing.rst @@ -73,7 +73,6 @@ We can now run the following commands to build the application from within the mkdir build cd build cmake -DCMAKE_PREFIX_PATH=/absolute/path/to/libtorch .. - cd .. make where ``/absolute/path/to/libtorch`` should be the absolute (!) 
path to the unzipped LibTorch diff --git a/docs/libtorch.rst b/docs/libtorch.rst index 6b5c411f5363..fe87bdfaeb94 100644 --- a/docs/libtorch.rst +++ b/docs/libtorch.rst @@ -18,7 +18,7 @@ You can use a python script/module located in tools package to build libtorch Alternatively, you can invoke a shell script in the same directory to achieve the same goal :: cd - BUILD_TORCH=ON ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 + ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 ls torch/lib/tmp_install # output is produced here ls torch/lib/tmp_install/lib/libtorch.so # of particular interest diff --git a/docs/source/_static/img/tensorboard/add_histogram_raw.png b/docs/source/_static/img/tensorboard/add_histogram_raw.png new file mode 100644 index 000000000000..96ebe5c48038 Binary files /dev/null and b/docs/source/_static/img/tensorboard/add_histogram_raw.png differ diff --git a/docs/source/jit.rst b/docs/source/jit.rst index eebd713ca1b8..56b75ae0175c 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -1100,6 +1100,36 @@ Q: How do I store attributes on a ``ScriptModule``? +Q: I would like to trace module's method but I keep getting this error: + +``RuntimeError: Cannot insert a Tensor that requires grad as a constant. Consider making it a parameter or input, or detaching the gradient`` + + This error usually means that, the method you are tracing, uses module's parameters and + you are passing module's method instead of a module instance (e.g. ``my_module_instance.forward`` vs ``my_module_instance``). + - Invoking ``trace`` with module's method captures module parameters (which may require gradients) as **constants**. + - On the other hand, invoking ``trace`` with module's instance (e.g. ``my_module``) creates a new module and correctly copies parameters into the new module, so they can accumulate gradients if required. + Given that ``trace`` treats ``my_module_instance.forward`` as a standalone function, it also means there is **not** currently a way to trace + arbitrary methods in the module except for ``forward`` that use module's parameters. + Version **1.1.1** will add a new API ``trace_module`` that will allow users to trace any method in the module and more than one method :: + + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv = nn.Conv2d(1, 1, 3) + + def forward(self, x): + return self.conv(x) + + def weighted_kernel_sum(self, weight): + return weight * self.conv.weight + + example_weight = torch.rand(1, 1, 3, 3) + example_forward_input = torch.rand(1, 1, 3, 3) + n = Net() + inputs = {'forward' : example_forward_input, 'weighted_kernel_sum' : example_weight} + module = torch.jit.trace_module(n, inputs) + + Builtin Functions ~~~~~~~~~~~~~~~~~ diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 90bf2878e7b7..ef1dcadd6dc8 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -212,10 +212,10 @@ To confirm whether the operator is standardized or not, please check the If the operator is an ATen operator, which means you can find the declaration of the function in ``torch/csrc/autograd/generated/VariableType.h`` (available in generated code in PyTorch install dir), you should add the symbolic -function in ``torch/onnx/symbolic.py`` and follow the instructions listed as below: +function in ``torch/onnx/symbolic_opset.py`` and follow the instructions listed as below: -* Define the symbolic function in - `torch/onnx/symbolic.py `_. 
+* Define the symbolic function in ``torch/onnx/symbolic_opset.py``, for example + `torch/onnx/symbolic_opset9.py `_. Make sure the function has the same name as the ATen operator/function defined in ``VariableType.h``. * The first parameter is always the exported ONNX graph. @@ -303,7 +303,7 @@ The ONNX graph C++ definition is in ``torch/csrc/jit/ir.h``. Here is an example of handling missing symbolic function for ``elu`` operator. We try to export the model and see the error message as below:: - UserWarning: ONNX export failed on elu because torch.onnx.symbolic.elu does not exist + UserWarning: ONNX export failed on elu because torch.onnx.symbolic_opset9.elu does not exist RuntimeError: ONNX export failed: Couldn't export operator elu The export fails because PyTorch does not support exporting ``elu`` operator. @@ -311,7 +311,7 @@ We find ``virtual Tensor elu(const Tensor & input, Scalar alpha, bool inplace) c in ``VariableType.h``. This means ``elu`` is an ATen operator. We check the `ONNX operator list `_, and confirm that ``Elu`` is standardized in ONNX. -We add the following lines to ``symbolic.py``:: +We add the following lines to ``symbolic_opset9.py``:: def elu(g, input, alpha, inplace=False): return g.op("Elu", input, alpha_f=_scalar(alpha)) @@ -319,7 +319,7 @@ We add the following lines to ``symbolic.py``:: Now PyTorch is able to export ``elu`` operator. There are more examples in -`symbolic.py `_, +`symbolic_opset9.py `_, `tensor.py `_, `padding.py `_. diff --git a/docs/source/tensorboard.rst b/docs/source/tensorboard.rst index 46cc0810a91b..d51149759527 100644 --- a/docs/source/tensorboard.rst +++ b/docs/source/tensorboard.rst @@ -86,6 +86,7 @@ Expected result: .. automethod:: add_scalar .. automethod:: add_scalars .. automethod:: add_histogram + .. automethod:: add_histogram_raw .. automethod:: add_image .. automethod:: add_images .. automethod:: add_figure diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 385b26d4d6c3..a86beffabbf2 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -142,6 +142,7 @@ view of a storage and defines numeric operations on it. .. autoattribute:: is_cuda .. autoattribute:: device .. autoattribute:: grad + .. autoattribute:: ndim .. automethod:: abs .. automethod:: abs_ @@ -208,6 +209,7 @@ view of a storage and defines numeric operations on it. .. automethod:: cumsum .. automethod:: data_ptr .. automethod:: dequantize + .. automethod:: dequantize_linear .. automethod:: det .. automethod:: dense_dim .. automethod:: detach diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 27694ce8ab42..cd4d6975101a 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -125,6 +125,8 @@ Parallelism ---------------------------------- .. autofunction:: get_num_threads .. autofunction:: set_num_threads +.. autofunction:: get_num_interop_threads +.. autofunction:: set_num_interop_threads Locally disabling gradient computation -------------------------------------- @@ -223,10 +225,12 @@ Reduction Ops .. autofunction:: norm .. autofunction:: prod .. autofunction:: std +.. autofunction:: std_mean .. autofunction:: sum .. autofunction:: unique .. autofunction:: unique_consecutive .. autofunction:: var +.. 
autofunction:: var_mean Comparison Ops diff --git a/scripts/build_windows.bat b/scripts/build_windows.bat index 0d1130dff65b..c89bc2458041 100644 --- a/scripts/build_windows.bat +++ b/scripts/build_windows.bat @@ -18,10 +18,6 @@ if NOT DEFINED BUILD_SHARED_LIBS ( set BUILD_SHARED_LIBS=OFF ) -if NOT DEFINED BUILD_TORCH ( - set BUILD_TORCH=OFF -) - IF NOT DEFINED BUILDING_WITH_TORCH_LIBS ( set BUILDING_WITH_TORCH_LIBS=OFF ) @@ -50,6 +46,10 @@ if NOT DEFINED USE_OBSERVERS ( set USE_OBSERVERS=OFF ) +if NOT DEFINED MSVC_Z7_OVERRIDE ( + set MSVC_Z7_OVERRIDE=OFF +) + if NOT DEFINED CMAKE_GENERATOR ( if DEFINED APPVEYOR_BUILD_WORKER_IMAGE ( if "%APPVEYOR_BUILD_WORKER_IMAGE%" == "Visual Studio 2017" ( diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 4ad633bd87f9..45393c62f0e4 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -23,7 +23,21 @@ do done set -- "${UNKNOWN[@]}" # leave UNKNOWN -pip install pytest scipy torchvision hypothesis +pip install pytest scipy hypothesis + +install_torchvision() { + echo "Installing torchvision at branch master" + rm -rf vision + # TODO: This git clone is bad, it means pushes to torchvision can break + # PyTorch CI + git clone https://github.com/pytorch/vision --quiet + pushd vision + pip install -q --user . + popd + rm -rf vision +} +install_torchvision + if [[ $PARALLEL == 1 ]]; then pip install pytest-xdist fi diff --git a/setup.py b/setup.py index 441bc08d7082..8c0a5cb56858 100644 --- a/setup.py +++ b/setup.py @@ -594,10 +594,11 @@ def run(self): try: import numpy as np - NUMPY_INCLUDE_DIR = np.get_include() - USE_NUMPY = True except ImportError: USE_NUMPY = False +else: + NUMPY_INCLUDE_DIR = np.get_include() + USE_NUMPY = True if USE_CUDA: if IS_WINDOWS: diff --git a/test/common_methods_invocations.py b/test/common_methods_invocations.py index 6202d5bdecef..18a787b9e46c 100644 --- a/test/common_methods_invocations.py +++ b/test/common_methods_invocations.py @@ -395,6 +395,16 @@ def method_tests(): ('std', (S, S, S), (1, True, True), 'keepdim_dim', (True,), [0]), ('std', (S,), (0,), 'dim_1d', (True,), [0]), ('std', (S,), (0, True, True), 'keepdim_dim_1d', (True,), [0]), + ('__var_mean__', (S, S, S), NO_ARGS, ''), + ('__var_mean__', (S, S, S), (1,), 'dim', [0]), + ('__var_mean__', (S, S, S), (1, True, True), 'keepdim_dim', [0]), + ('__var_mean__', (S,), (0,), 'dim_1d', [0]), + ('__var_mean__', (S,), (0, True, True), 'keepdim_dim_1d', [0]), + ('__std_mean__', (S, S, S), NO_ARGS, ''), + ('__std_mean__', (S, S, S), (1,), 'dim', [0]), + ('__std_mean__', (S, S, S), (1, True, True), 'keepdim_dim', [0]), + ('__std_mean__', (S,), (0,), 'dim_1d', [0]), + ('__std_mean__', (S,), (0, True, True), 'keepdim_dim_1d', [0]), ('renorm', (S, S, S), (2, 1, 0.5), 'dim', (), [1]), ('renorm', (S, S, S), (1, 2, 3), 'norm_1'), ('renorm', (S, S, S), (inf, 2, 0.5), 'norm_inf'), @@ -456,8 +466,12 @@ def method_tests(): ('ger', (S,), ((M,),)), ('matmul', (L,), ((L,),), '', (True,)), ('matmul', (S, M), ((M,),), "2d_1d", (True,)), - ('matmul', (M, ), ((M, S),), "1d_2d", (True,)), + ('matmul', (M,), ((M, S),), "1d_2d", (True,)), ('matmul', (S, M), ((M, S),), "2d_2d", (True,)), + ('matmul', (S, S, M), ((M,),), "3d_1d", (True,)), + ('matmul', (S, S, M), ((M, S),), "3d_2d", (True,)), + ('matmul', (M,), ((S, M, S),), "1d_3d", (True,)), + ('matmul', (S, M), ((S, M, S),), "2d_3d", (True,)), ('matmul', (S, S, M, M), ((S, S, M, S),), "4d_4d", (True,)), ('matmul', (S, S, M, M), ((M,),), "4d_1d", (True,)), ('matmul', (M,), ((S, S, M, S),), "1d_4d", (True,)), diff --git 
a/test/common_utils.py b/test/common_utils.py index ca9b0b9184c2..4ef97e85e936 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -20,10 +20,12 @@ import socket import time from collections import OrderedDict +from contextlib import contextmanager from functools import wraps from itertools import product from copy import deepcopy from numbers import Number +import tempfile import __main__ import errno @@ -66,6 +68,24 @@ def run_tests(argv=UNITTEST_ARGS): # Environment variable `IS_PYTORCH_CI` is set in `.jenkins/common.sh`. IS_PYTORCH_CI = bool(os.environ.get('IS_PYTORCH_CI', 0)) +if IS_WINDOWS: + @contextmanager + def TemporaryFileName(): + # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile + # opens the file, and it cannot be opened multiple times in Windows. To support Windows, + # close the file after creation and try to remove it manually + f = tempfile.NamedTemporaryFile(delete=False) + try: + f.close() + yield f.name + finally: + os.unlink(f.name) +else: + @contextmanager # noqa: T484 + def TemporaryFileName(): + with tempfile.NamedTemporaryFile() as f: + yield f.name + def _check_module_exists(name): r"""Returns if a top-level module with :attr:`name` exists *without** diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index ef63856d5680..d59946199b35 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -30,7 +30,7 @@ endif() add_executable(test_api ${TORCH_API_TEST_SOURCES}) target_include_directories(test_api PRIVATE ${ATen_CPU_INCLUDE}) -target_link_libraries(test_api PRIVATE torch gtest) +target_link_libraries(test_api PRIVATE caffe2 gtest) if (USE_CUDA) target_link_libraries(test_api PRIVATE @@ -38,6 +38,9 @@ if (USE_CUDA) ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) + + target_link_libraries(test_api PRIVATE caffe2_gpu) + target_compile_definitions(test_api PRIVATE "USE_CUDA") endif() diff --git a/test/cpp/api/torch_include.cpp b/test/cpp/api/torch_include.cpp index 1bcde267cc17..d85e728de88e 100644 --- a/test/cpp/api/torch_include.cpp +++ b/test/cpp/api/torch_include.cpp @@ -9,4 +9,6 @@ TEST(TorchIncludeTest, GetSetNumThreads) { torch::init_num_threads(); torch::set_num_threads(2); ASSERT_EQ(torch::get_num_threads(), 2); + torch::set_num_interop_threads(2); + ASSERT_EQ(torch::get_num_interop_threads(), 2); } diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 66860ebef0ab..263a306dc848 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -4,7 +4,8 @@ add_executable(test_jit ${TORCH_ROOT}/test/cpp/common/main.cpp ${JIT_TEST_ROOT}/test.cpp) -target_link_libraries(test_jit PRIVATE torch gtest) +target_link_libraries(test_jit PRIVATE caffe2 gtest) +target_include_directories(test_jit PRIVATE ${ATen_CPU_INCLUDE}) target_compile_definitions(test_jit PRIVATE USE_GTEST) if (USE_CUDA) @@ -13,5 +14,8 @@ if (USE_CUDA) ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) + + target_link_libraries(test_jit PRIVATE caffe2_gpu) + target_compile_definitions(test_jit PRIVATE USE_CUDA) endif() diff --git a/test/cpp/jit/test.cpp b/test/cpp/jit/test.cpp index 78a527976f24..7a7b45b3c386 100644 --- a/test/cpp/jit/test.cpp +++ b/test/cpp/jit/test.cpp @@ -31,6 +31,7 @@ #include #include #include +#include using namespace torch::jit::script; using namespace torch::jit::test; @@ -75,12 +76,14 @@ namespace jit { _(NoneSchemaMatch) \ _(ClassParser) \ _(Profiler) \ + _(InsertGuards) \ _(PeepholeOptimize) \ _(RecordFunction) \ 
_(SubgraphMatching) \ _(ModuleDefine) \ _(QualifiedName) \ - _(ClassImport) + _(ClassImport) \ + _(ScriptObject) #define TH_FORALL_TESTS_CUDA(_) \ _(ArgumentSpec) \ @@ -108,7 +111,7 @@ TH_FORALL_TESTS_CUDA(JIT_GTEST_CUDA) #endif #define JIT_TEST(name) test##name(); -void runJITCPPTests(bool runCuda) { +TORCH_API void runJITCPPTests(bool runCuda) { TH_FORALL_TESTS(JIT_TEST) if (runCuda) { TH_FORALL_TESTS_CUDA(JIT_TEST) diff --git a/test/cpp/jit/test_alias_analysis.h b/test/cpp/jit/test_alias_analysis.h index d3d9550971e9..87a07f548ae9 100644 --- a/test/cpp/jit/test_alias_analysis.h +++ b/test/cpp/jit/test_alias_analysis.h @@ -680,6 +680,48 @@ graph(): AT_ASSERT(!aliasDb.mayContainAlias(first_st, second_st)); AT_ASSERT(!aliasDb.mayContainAlias(second_st, tup_st)); } + { + // Test list container aliasing + auto graph = std::make_shared(); + std::unordered_map vmap; + script::parseIR( + R"IR( +graph(): + %10 : bool? = prim::Constant() + %8 : Device? = prim::Constant() + %4 : int? = prim::Constant() + %0 : int = prim::Constant[value=2]() + %1 : int = prim::Constant[value=3]() + %2 : int[] = prim::ListConstruct(%0, %1) + %x : Tensor = aten::rand(%2, %4, %4, %8, %10) + %12 : int[] = prim::ListConstruct(%0, %1) + %y : Tensor = aten::rand(%12, %4, %4, %8, %10) + %22 : int[] = prim::ListConstruct(%0, %1) + %z : Tensor = aten::rand(%22, %4, %4, %8, %10) + %32 : int[] = prim::ListConstruct(%0, %1) + %fresh : Tensor = aten::rand(%32, %4, %4, %8, %10) + %foo : Tensor[] = prim::ListConstruct(%x, %y) + %43 : Tensor[] = aten::append(%foo, %z) + return () +)IR", + graph.get(), + vmap); + AliasDb aliasDb(graph); + auto x = vmap["x"]; + auto y = vmap["y"]; + auto z = vmap["z"]; + // Tensors x, y, and z went into a list, so they all may alias each other. + ASSERT_TRUE(aliasDb.mayAlias(x, y)); + ASSERT_TRUE(aliasDb.mayAlias(y, z)); + ASSERT_TRUE(aliasDb.mayAlias(x, z)); + + // But we know `fresh` didn't go into a list, so x, y, and z should not + // alias it. 
+ auto fresh = vmap["fresh"]; + ASSERT_FALSE(aliasDb.mayAlias(x, fresh)); + ASSERT_FALSE(aliasDb.mayAlias(y, fresh)); + ASSERT_FALSE(aliasDb.mayAlias(z, fresh)); + } } void testWildcards() { @@ -707,7 +749,7 @@ void testWildcards() { AliasDb aliasDb(graph); ASSERT_FALSE(aliasDb.mayAlias(a, fresh)); - ASSERT_TRUE(aliasDb.mayAlias(wildcard, fresh)); + ASSERT_FALSE(aliasDb.mayAlias(wildcard, fresh)); ASSERT_TRUE(aliasDb.mayAlias(wildcard, a)); ASSERT_FALSE(aliasDb.mayAlias( std::unordered_set({wildcard}), @@ -719,8 +761,7 @@ void testWildcards() { { graph->lint(); AliasDb aliasDb(graph); - // Any write should be considered a write to the wildcard - ASSERT_TRUE(aliasDb.hasWriters(wildcard->node())); + ASSERT_FALSE(aliasDb.hasWriters(wildcard->node())); } const auto wildcardWrite = graph->insert(writes, {wildcard})->node(); @@ -728,9 +769,9 @@ void testWildcards() { graph->lint(); AliasDb aliasDb(graph); // Test writes to wildcards - ASSERT_TRUE(aliasDb.writesToAlias( + ASSERT_FALSE(aliasDb.writesToAlias( wildcardWrite, std::unordered_set{fresh})); - ASSERT_TRUE(aliasDb.writesToAlias( + ASSERT_FALSE(aliasDb.writesToAlias( wildcardWrite, std::unordered_set{fresh2})); ASSERT_TRUE(aliasDb.writesToAlias( wildcardWrite, std::unordered_set{a})); diff --git a/test/cpp/jit/test_base.h b/test/cpp/jit/test_base.h index 7726308982f8..64e7203dc96e 100644 --- a/test/cpp/jit/test_base.h +++ b/test/cpp/jit/test_base.h @@ -10,9 +10,9 @@ #include #else #include "c10/util/Exception.h" -#define ASSERT_EQ(x, y) AT_ASSERT((x) == (y)) -#define ASSERT_NE(x, y) AT_ASSERT((x) != (y)) -#define ASSERT_TRUE AT_ASSERT +#define ASSERT_EQ(x, y) TORCH_INTERNAL_ASSERT((x) == (y)) +#define ASSERT_NE(x, y) TORCH_INTERNAL_ASSERT((x) != (y)) +#define ASSERT_TRUE TORCH_INTERNAL_ASSERT #define ASSERT_FALSE(x) ASSERT_TRUE(!(x)) #define ASSERT_THROWS_WITH(statement, substring) \ try { \ diff --git a/test/cpp/jit/test_class_import.h b/test/cpp/jit/test_class_import.h index d8b3f8423d2b..0ba81b22b345 100644 --- a/test/cpp/jit/test_class_import.h +++ b/test/cpp/jit/test_class_import.h @@ -1,8 +1,11 @@ #pragma once -#include #include +#include + +#include #include +#include namespace torch { namespace jit { @@ -13,10 +16,12 @@ op_version_set = 1 class FooNestedTest: def __init__(self, y): self.y = y + class FooNestedTest2: def __init__(self, y): self.y = y self.nested = __torch__.FooNestedTest(y) + class FooTest: def __init__(self, x): self.class_attr = __torch__.FooNestedTest(x) @@ -58,6 +63,37 @@ void testClassImport() { ASSERT_FALSE(c); } +void testScriptObject() { + Module m1; + Module m2; + std::vector constantTable; + import_libs( + m1.class_compilation_unit(), + "__torch__", + classSrcs1, + constantTable, + nullptr); + import_libs( + m2.class_compilation_unit(), + "__torch__", + classSrcs2, + constantTable, + nullptr); + + // Incorrect arguments for constructor should throw + c10::QualifiedName base("__torch__"); + ASSERT_ANY_THROW(m1.create_class(c10::QualifiedName(base, "FooTest"), {1})); + auto x = torch::ones({2, 3}); + auto obj = m2.create_class(c10::QualifiedName(base, "FooTest"), x).toObject(); + auto dx = obj->getAttr("dx"); + ASSERT_TRUE(test::almostEqual(x, dx.toTensor())); + + auto new_x = torch::rand({2, 3}); + obj->setAttr("dx", new_x); + auto new_dx = obj->getAttr("dx"); + ASSERT_TRUE(test::almostEqual(new_x, new_dx.toTensor())); +} + } // namespace script } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_fuser.h b/test/cpp/jit/test_fuser.h index 29ed1a68ccec..6f28520820f5 100644 --- 
a/test/cpp/jit/test_fuser.h +++ b/test/cpp/jit/test_fuser.h @@ -197,7 +197,7 @@ void testRegisterFusionCachesKernel(std::ostream& out = std::cout) { std::find_if(nodes.begin(), nodes.end(), [](const Node* node) { return node->kind() == prim::FusionGroup; }); - AT_CHECK( + TORCH_CHECK( maybe_fusion_group != nodes.end(), "testRegisterFusionCachesKernel: could not create FusionGroup"); return *maybe_fusion_group; diff --git a/test/cpp/jit/test_misc.h b/test/cpp/jit/test_misc.h index e2de3a4973ab..cf3ec10b1c1a 100644 --- a/test/cpp/jit/test_misc.h +++ b/test/cpp/jit/test_misc.h @@ -23,6 +23,7 @@ #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/passes/graph_fuser.h" +#include "torch/csrc/jit/passes/insert_guards.h" #include "torch/csrc/jit/passes/lower_grad_of.h" #include "torch/csrc/jit/passes/lower_tuples.h" #include "torch/csrc/jit/passes/requires_grad_analysis.h" @@ -642,22 +643,22 @@ void checkTracedInputs(const TracedTestInputs& inputs) { const auto& sizes = std::get<1>(input); if (fn == "test") { found_test = true; - AT_CHECK(sizes.size() == 1); - AT_CHECK(sizes[0] == std::vector({1, 2, 3})); + TORCH_CHECK(sizes.size() == 1); + TORCH_CHECK(sizes[0] == std::vector({1, 2, 3})); } else if (fn == "test::pow") { found_pow = true; - AT_CHECK(sizes.size() == 2); - AT_CHECK(sizes[0] == std::vector({1, 2, 3})); - AT_CHECK(sizes[1].empty()); + TORCH_CHECK(sizes.size() == 2); + TORCH_CHECK(sizes[0] == std::vector({1, 2, 3})); + TORCH_CHECK(sizes[1].empty()); } else if (fn.find("::mul") != std::string::npos) { found_mul = true; - AT_CHECK(sizes.size() > 1); - AT_CHECK(sizes[0] == std::vector({1, 2, 3})); + TORCH_CHECK(sizes.size() > 1); + TORCH_CHECK(sizes[0] == std::vector({1, 2, 3})); } } - AT_CHECK(found_test); - AT_CHECK(found_pow); - AT_CHECK(found_mul); + TORCH_CHECK(found_test); + TORCH_CHECK(found_pow); + TORCH_CHECK(found_mul); } std::string getFullName(const autograd::profiler::RecordFunction* fn_ptr) { @@ -683,13 +684,17 @@ void testRecordFunction() { for (const auto& input : inputs) { if (input.isTensor()) { sizes.push_back(input.toTensor().sizes().vec()); - } else if (input.isScalar()){ + } else if (input.isScalar()) { sizes.push_back(std::vector()); } } traced_inputs.push_back( std::make_tuple(std::string(getFullName(&fn)), sizes)); - }, [](const autograd::profiler::RecordFunction&) {}, true); + }, + [](const autograd::profiler::RecordFunction&) {}, + true); + + autograd::profiler::setSamplingProbability(1.0); auto t = torch::randn({1, 2, 3}, at::kCPU); t.set_requires_grad(true); @@ -736,7 +741,7 @@ void testAutogradProfiler() { for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos; count++, pos++) { } - AT_CHECK(count == 200); + TORCH_CHECK(count == 200); } void testNoneSchemaMatch() { @@ -797,7 +802,7 @@ void testModuleConversion() { // test cuda to cpu for params and buffers m->register_parameter("foo", torch::ones({}, at::kCUDA), false); m->register_buffer("bar", torch::ones({}, at::kCUDA)); - + m->to(at::kCUDA); m->to(at::kCPU); AT_ASSERT(m->get_parameter("foo").data().device().is_cpu()); @@ -807,14 +812,13 @@ void testModuleConversion() { // test cpu to cuda for params and buffers m->register_parameter("foo", torch::ones({}), false); m->register_buffer("bar", torch::ones({})); - + m->to(at::kCUDA); AT_ASSERT(m->get_parameter("foo").data().device().is_cuda()); AT_ASSERT(m->get_buffer("bar").data().device().is_cuda()); } } - static int testPassValue = 0; void 
fakePass(std::shared_ptr& g) { testPassValue++; @@ -841,12 +845,53 @@ graph(%a): AT_ASSERT(testPassValue); } -static void checkShape(Node* n, std::vector expected) { - auto tp = n->output()->type(); +static void checkShape( + Node* n, + std::vector expected, + bool prev = true) { + auto profile = (prev) ? n->inputs().at(0)->node() : n; + auto tp = profile->output()->type(); auto ptp = tp->expect(); ASSERT_EQ(ptp->sizes().concrete_sizes().value(), expected); } +void testInsertGuards() { + static const auto basic_example = R"JIT( + def basic(x, y): + a = x + y + b = x * y + c = x + 1 + d = a - c + e = b - c + return d + e + )JIT"; + + auto cu = compile(basic_example); + auto& fun = cu->get_function("basic"); + auto pr = ProfilingRecord::instrumentGraph(fun.graph()); + auto x = at::randn({2, 3}, at::kCPU); + auto y = at::randn({2, 3}, at::kCPU); + auto v = [](at::Tensor t) { return autograd::make_variable(t, false); }; + auto stack = createStack({v(x), v(y)}); + // introduce some profiling information + Code cd(pr->profiled_graph_); + InterpreterState is{cd}; + is.run(stack); + auto copy = pr->profiled_graph_->copy(); + InsertGuards(copy); + auto nodes = copy->block()->nodes(); + auto guard = std::find_if(nodes.begin(), nodes.end(), [](Node* n) { + return n->kind() == prim::Guard; + }); + ASSERT_NE(guard, nodes.end()); + ASSERT_EQ(guard->input()->type()->cast(), nullptr); + checkShape(*guard, {2, 3}, false); + int num_guards = std::count_if(nodes.begin(), nodes.end(), [](Node* n) { + return n->kind() == prim::Guard; + }); + ASSERT_EQ(num_guards, 11); +} + void testProfiler() { constexpr int batch_size = 4; constexpr int input_size = 256; @@ -879,7 +924,7 @@ void testProfiler() { auto mm = std::find_if(begin, end, [](Node* n) { return n->kind() == aten::mm; }); ASSERT_NE(mm, end); - std::vector mm_expected{4, 2048}; + std::vector mm_expected{4, 256}; std::vector eltwise{4, 512}; checkShape(*mm, mm_expected); auto sigmoid_n = std::find_if( diff --git a/test/cpp/jit/test_subgraph_matcher.h b/test/cpp/jit/test_subgraph_matcher.h index ee157de9f8bd..f33e19e5c7ba 100644 --- a/test/cpp/jit/test_subgraph_matcher.h +++ b/test/cpp/jit/test_subgraph_matcher.h @@ -361,6 +361,80 @@ graph(%x, %y): AT_ASSERT(findPatternMatches(pattern1, graph).size() == 0); } +void testAttributes() { + Graph graph; + script::parseIR( + R"IR( +graph(%0): + %a = a::a[isattr=[1,2]](%0) + %b = a::b[intattr=10, floatattr=3.14](%0) + %c = a::c[myattr="qqq"](%a, %b) + return (%c))IR", + &graph); + + { + Graph pattern; + script::parseIR( + R"IR( +graph(%a, %b): + %c = a::c[myattr="qqq"](%a, %b) + return (%c))IR", + &pattern); + AT_ASSERT(!findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%a, %b): + %c = a::c[myattr="zzz"](%a, %b) + return (%c))IR", + &pattern); + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %b = a::b[extraattr=10](%0) + return (%b))IR", + &pattern); + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %b = a::b[intattr=10, floatattr=3.14](%0) + return (%b))IR", + &pattern); + AT_ASSERT(!findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %b = a::b[intattr=10, floatattr=3.14, strattr="rrr"](%0) + return (%b))IR", + &pattern); + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } + { + Graph pattern; + script::parseIR( + R"IR( +graph(%0): + %a = 
a::a[isattr=[1,2]](%0) + return (%a))IR", + &pattern); + // Lists are not supported yet, thus we shouldn't match for now. + AT_ASSERT(findPatternMatches(pattern, graph).empty()); + } +} + void testBadPattern() { Graph graph, pattern1, pattern2; script::parseIR( @@ -405,6 +479,7 @@ void testSubgraphMatching() { testOverlappingMatches(); testMatchInBasicBlocks1(); testMatchInBasicBlocks2(); + testAttributes(); testBadPattern(); } diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index 459ffb30dc21..f512dc6dbc97 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -40,7 +40,7 @@ struct ComplexCPUType : public at::CPUTypeDefault { AT_ASSERT(options.device().is_cpu()); for (auto x: size) { - AT_CHECK(x >= 0, "Trying to create tensor using size with negative dimension: ", size); + TORCH_CHECK(x >= 0, "Trying to create tensor using size with negative dimension: ", size); } auto* allocator = at::getCPUAllocator(); int64_t nelements = at::prod_intlist(size); diff --git a/test/cpp_extensions/cuda_extension.cpp b/test/cpp_extensions/cuda_extension.cpp index ad7396fe7f45..d6349b8aa0b3 100644 --- a/test/cpp_extensions/cuda_extension.cpp +++ b/test/cpp_extensions/cuda_extension.cpp @@ -6,8 +6,8 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size); torch::Tensor sigmoid_add(torch::Tensor x, torch::Tensor y) { - AT_CHECK(x.type().is_cuda(), "x must be a CUDA tensor"); - AT_CHECK(y.type().is_cuda(), "y must be a CUDA tensor"); + TORCH_CHECK(x.type().is_cuda(), "x must be a CUDA tensor"); + TORCH_CHECK(y.type().is_cuda(), "y must be a CUDA tensor"); auto output = torch::zeros_like(x); sigmoid_add_cuda( x.data(), y.data(), output.data(), output.numel()); diff --git a/test/expect/TestScript.test_print-stdout.expect b/test/expect/TestScript.test_print-stdout.expect index 0131b4bcc523..f4449c73b849 100644 --- a/test/expect/TestScript.test_print-stdout.expect +++ b/test/expect/TestScript.test_print-stdout.expect @@ -2,4 +2,4 @@ 0.9526 0.9975 0.9999 -[ Variable[CPUType]{4} ] 1 2 [1, 2] [1., 2.] +[ Variable[CPUDoubleType]{4} ] 1 2 [1, 2] [1., 2.] 
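For orientation only (not part of the patch): the expect-file change just above, and the similar `test_string_print` one that follows, record that TorchScript's `print` of a tensor now reports the concrete dtype-qualified type (`CPUDoubleType`, `CPULongType`) instead of the generic `CPUType`. Below is a minimal sketch of the kind of scripted print these expect files cover, assuming only the public `torch.jit.script` API; the `print_various` helper is hypothetical and the exact output string depends on the TorchScript print builtin of this era.

```python
import torch

@torch.jit.script
def print_various(x):
    # At the time of this patch, TorchScript's print builtin formats tensors
    # with their concrete backend/dtype type, which is what the expect file
    # above records (e.g. "[ Variable[CPUDoubleType]{4} ] 1 2 [1, 2] [1., 2.]").
    print(x, 1, 2, [1, 2], [1.0, 2.0])

# A double-precision input is what makes the printed type CPUDoubleType
# rather than the old generic CPUType.
print_various(torch.rand(4, dtype=torch.double))
```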
diff --git a/test/expect/TestScript.test_string_print-stdout.expect b/test/expect/TestScript.test_string_print-stdout.expect index c8a75f4ca842..19f670510f10 100644 --- a/test/expect/TestScript.test_string_print-stdout.expect +++ b/test/expect/TestScript.test_string_print-stdout.expect @@ -1,2 +1,2 @@ 1 -[ Variable[CPUType]{} ] abcd 2 1.5 +[ Variable[CPULongType]{} ] abcd 2 1.5 diff --git a/test/onnx/expect/TestOperators.test_c2_op.expect b/test/onnx/expect/TestOperators.test_c2_op.expect index 568df7594c6c..bfd22835e355 100644 --- a/test/onnx/expect/TestOperators.test_c2_op.expect +++ b/test/onnx/expect/TestOperators.test_c2_op.expect @@ -55,6 +55,11 @@ graph { f: 1 type: FLOAT } + attribute { + name: "legacy_plus_one" + i: 1 + type: INT + } domain: "org.pytorch._caffe2" } name: "torch-jit-export" diff --git a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect index 61b6d01a99eb..470a5d94d2d9 100644 --- a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect +++ b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect @@ -6,6 +6,11 @@ graph { input: "0" output: "1" op_type: "MaxPool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "dilations" ints: 2 diff --git a/test/onnx/expect/TestOperators.test_sign.expect b/test/onnx/expect/TestOperators.test_sign.expect new file mode 100644 index 000000000000..5fb611054ec6 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_sign.expect @@ -0,0 +1,46 @@ +ir_version: 4 +producer_name: "pytorch" +producer_version: "1.1" +graph { + node { + input: "0" + output: "1" + op_type: "Sign" + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py new file mode 100644 index 000000000000..37a87af255be --- /dev/null +++ b/test/onnx/test_onnx_opset.py @@ -0,0 +1,111 @@ +from test_pytorch_common import TestCase, run_tests + +import torch +import torch.onnx +from torch.nn import Module + +import onnx + +import io + +from torch.onnx.symbolic_helper import _export_onnx_opset_version +from torch.onnx import ir_version, producer_name, producer_version + + +def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_version): + # check_onnx_components + assert model.ir_version == ir_version and \ + model.producer_name == producer_name and \ + model.producer_version == producer_version and \ + model.opset_import[0].version == opset_version + + # check the schema with the onnx checker + onnx.checker.check_model(model) + + # check target type and attributes + graph = model.graph + # ops should contain an object for each node + # in graph.node, in the right order. 
+ # At least the op_name should be specified, + # but the op's attributes can optionally be + # specified as well + assert len(ops) == len(graph.node) + for i in range(0, len(ops)): + assert graph.node[i].op_type == ops[i]['op_name'] + if "attributes" in ops[i] : + attributes = ops[i]['attributes'] + assert len(attributes) == len(graph.node[i].attribute) + for j in range(0, len(attributes)): + for attribute_field in attributes[j].keys(): + assert attributes[j][attribute_field] == getattr(graph.node[i].attribute[j], attribute_field) + + +def check_onnx_opsets_operator(module, x, ops, opset_versions): + for opset_version in opset_versions: + f = io.BytesIO() + torch.onnx.export(module, x, f, opset_version=opset_version) + model = onnx.load(io.BytesIO(f.getvalue())) + check_onnx_opset_operator(model, ops[opset_version], opset_version) + + +class TestONNXOpset(TestCase): + + def test_opset_fallback(self): + class MyModule(Module): + def forward(self, x): + return torch.isnan(x) + + ops = [{"op_name" : "IsNaN"}, + {"op_name" : "Cast", "attributes" : [{"name" : "to", "i" : 2, "type" : 2}]}] + ops = {9 : ops, 10 : ops} + x = torch.tensor([1.0, float('nan'), 2.0]) + check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) + + def test_topk(self): + class MyModule(Module): + def forward(self, x): + return torch.topk(x, 3) + + ops_9 = [{"op_name" : "TopK", "attributes" : [{"name" : "axis", "i" : -1, "type" : 2}, + {"name" : "k", "i" : 3, "type" : 2}]}] + ops_10 = [{"op_name" : "Constant", "attributes" : [{"name" : "value", "type" : 4}]}, + {"op_name" : "Unsqueeze", "attributes" : [{"name" : "axes", "ints" : [0], "type" : 7}]}, + {"op_name" : "TopK", "attributes" : [{"name" : "axis", "i" : -1, "type" : 2}]}] + ops = {9 : ops_9, 10 : ops_10} + x = torch.arange(1., 6., requires_grad=True) + check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) + + def test_maxpool(self): + module = torch.nn.MaxPool1d(2, stride=1) + + ops_9 = [{"op_name" : "MaxPool", + "attributes" : + [{"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}]}] + ops_10 = [{"op_name" : "MaxPool", + "attributes" : + [{"name": "ceil_mode", "i": 0, "type": 2}, + {"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}]}] + ops = {9 : ops_9, 10 : ops_10} + x = torch.randn(20, 16, 50) + check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) + + # add test with dilations + module = torch.nn.MaxPool1d(2, stride=1, dilation=2) + + ops_10 = [{"op_name" : "MaxPool", + "attributes" : + [{"name": "ceil_mode", "i": 0, "type": 2}, + {"name": "dilations", "ints": [2], "type": 7}, + {"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}]}] + ops = {9 : ops_9, 10 : ops_10} + x = torch.randn(20, 16, 50) + check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) + +if __name__ == '__main__': + run_tests() diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 955892e2da6a..4ab0b2fa4b6a 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -416,6 +416,10 @@ def test_slice(self): x = torch.rand(3, 4, requires_grad=True) self.assertONNX(lambda x: x[:, 1:2], x) + def test_sign(self): + x = torch.rand(3, 4, requires_grad=True) + self.assertONNX(lambda x: x.sign(), x) + def test_narrow(self): x = torch.randn(3, 3, 
requires_grad=True) self.assertONNX(lambda x: torch.narrow(x, 0, 0, 2), x) @@ -581,7 +585,7 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( (scores), (bbox_deltas), (im_info), (anchors), - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) return a, b diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 529b186f2352..deecd3630838 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -77,6 +77,11 @@ def wrapper(self): def do_export(model, inputs, *args, **kwargs): f = io.BytesIO() out = torch.onnx._export(model, inputs, f, *args, **kwargs) + if isinstance(model, torch.jit.ScriptModule): + # Special case for common case of passing a single Tensor + if isinstance(inputs, torch.Tensor): + inputs = (inputs,) + out = model(*inputs) return f.getvalue(), out @@ -178,7 +183,7 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, # Verify the model runs the same in Caffe2 verify.verify(model, input, c2, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding) + example_outputs=example_outputs, do_constant_folding=do_constant_folding) def run_model_test(self, model, train, batch_size, state_dict=None, input=None, use_gpu=True, rtol=0.001, atol=1e-7, @@ -1249,6 +1254,17 @@ def forward(self, input): x = torch.tensor([1.0, float('nan'), 2.0]) self.run_model_test(IsNaNModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + def test_scatter(self): + class ScatterModel(torch.nn.Module): + def forward(self, input, indices, values): + return input.scatter(1, indices, values) + + input = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) + values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) + self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), + batch_size=BATCH_SIZE, use_gpu=False) + def test_flatten(self): class FlattenModel(torch.nn.Module): def forward(self, input): @@ -1368,7 +1384,7 @@ def forward(self, feature, im_info, anchors): bbox_deltas = self.conv(feature) a, b = torch.ops._caffe2.GenerateProposals( feature, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) output = torch.ops._caffe2.RoIAlign( feature, a, @@ -1424,7 +1440,7 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( scores, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) return a, b @@ -1458,6 +1474,7 @@ def forward(self, rois, deltas, im_info): angle_bound_lo=-90, angle_bound_hi=90, clip_angle_thresh=0.5, + legacy_plus_one=True, ) return a, b @@ -1475,7 +1492,7 @@ def forward(self, rois, deltas, im_info): im_info[:, 2] = 1.0 im_info = torch.zeros((batch_size, 3)) inputs = (torch.tensor(rois), torch.tensor(deltas), torch.tensor(im_info)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3) + self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) # BoxWithNMSLimits has requirements for the inputs, so randomly generated inputs # in Caffe2BackendTestEmbed doesn't work with this op. 
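For orientation only (not part of the patch): the new `test/onnx/test_onnx_opset.py` added earlier in this patch asserts that exporting the same module at different `opset_version` values yields specific ONNX nodes and attributes (for example, `MaxPool` gains a `ceil_mode` attribute in opset 10). A minimal sketch of that check, using only `torch.onnx.export` and the `onnx` package; the loop and the printed summary are illustrative rather than the test's actual assertions.

```python
import io

import onnx
import torch

module = torch.nn.MaxPool1d(2, stride=1)
x = torch.randn(20, 16, 50)

for opset_version in (9, 10):
    f = io.BytesIO()
    # Export the same module at each opset version under test.
    torch.onnx.export(module, x, f, opset_version=opset_version)
    model = onnx.load(io.BytesIO(f.getvalue()))
    # Print each node's op_type and attribute names so the per-opset
    # differences (e.g. MaxPool's ceil_mode attribute in opset 10) are visible.
    for node in model.graph.node:
        print(opset_version, node.op_type,
              [attr.name for attr in node.attribute])
```

Loading the exported bytes back with `onnx.load` mirrors what `check_onnx_opsets_operator` in the new test file does before comparing the graph node by node against the expected op list.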
@@ -1502,6 +1519,7 @@ def test_c2_box_with_nms_limits(self): -90, 90, clip_angle_thresh, + legacy_plus_one=True, ) ] class_prob = np.random.randn(sum(roi_counts), num_classes).astype(np.float32) @@ -1529,11 +1547,12 @@ def forward(self, class_prob, pred_bbox, batch_splits): cls_agnostic_bbox_reg=False, input_boxes_include_bg_cls=True, output_classes_include_bg_cls=True, + legacy_plus_one=True, ) return a, b, c, d inputs = (torch.tensor(class_prob), torch.tensor(pred_bbox), torch.tensor(batch_splits)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3) + self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) def test_c2_inference_lstm(self): num_layers = 4 @@ -1572,7 +1591,24 @@ def forward(self, lstm_in): torch.from_numpy(hx), ] + [param.detach() for param in torch_lstm._flat_weights] - self.run_model_test(MyModel(), train=False, input=lstm_in, batch_size=3) + self.run_model_test(MyModel(), train=False, input=lstm_in, batch_size=3, use_gpu=False) + + def test_topk(self): + class TopKModel(torch.nn.Module): + def forward(self, input): + return torch.topk(input, 3) + + x = torch.arange(1., 6.) + self.run_model_test(TopKModel(), train=False, input=x, batch_size=BATCH_SIZE) + + def test_topk_script(self): + class TopKModel(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, input): + return torch.topk(input, 3, dim=0) + + x = torch.randn(4, 3, requires_grad=True) + self.run_model_test(TopKModel(), train=False, input=(x,), batch_size=BATCH_SIZE, example_outputs=torch.topk(x, 3, dim=0)) def test_floor(self): class FloorModel(torch.nn.Module): @@ -1643,6 +1679,24 @@ def forward(self, input): return view_by_prim_shape(input) self.run_model_test(PrimShapeModel(), train=False, input=x, batch_size=BATCH_SIZE) + def test_and(self): + class AndModel(torch.nn.Module): + def forward(self, x, y): + return x & y + + x = torch.randint(0, 1, (3, 5)) + y = torch.randint(0, 1, (3, 5)) + self.run_model_test(AndModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + + def test_or(self): + class OrModel(torch.nn.Module): + def forward(self, x, y): + return x | y + + x = torch.randint(0, 1, (3, 5)) + y = torch.randint(0, 1, (3, 5)) + self.run_model_test(OrModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + # a bit of metaprogramming to set up all the rnn tests diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 4d3ee99a1532..0470669a21ec 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -4,7 +4,7 @@ import torch import torch.onnx from torch.onnx import utils -from torch.onnx.symbolic import _set_opset_version +from torch.onnx.symbolic_helper import _set_opset_version import onnx diff --git a/test/onnx/verify.py b/test/onnx/verify.py index b687a99962c1..b104dca726cb 100644 --- a/test/onnx/verify.py +++ b/test/onnx/verify.py @@ -244,7 +244,7 @@ def set_training(model, mode): def verify(model, args, backend, verbose=False, training=False, rtol=1e-3, atol=1e-7, - test_args=2, do_constant_folding=False): + test_args=2, do_constant_folding=False, example_outputs=None): """ Export a model into ONNX, import it into a specified ONNX backend, and then on a few random inputs verify that PyTorch and the backend produced the same @@ -358,14 +358,18 @@ def load_bytes(b): with set_training(model, training): proto_bytes = io.BytesIO() torch_out = torch.onnx._export(model, args, proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding) + 
do_constant_folding=do_constant_folding, example_outputs=example_outputs) + if isinstance(model, torch.jit.ScriptModule): + torch_out = model(*args) proto = load_bytes(proto_bytes) prepared = backend.prepare(proto) def run(args): alt_proto_bytes = io.BytesIO() torch_out = torch.onnx._export(model, args, alt_proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding) + do_constant_folding=do_constant_folding, example_outputs=example_outputs) + if isinstance(model, torch.jit.ScriptModule): + torch_out = model(*args) alt_proto = load_bytes(alt_proto_bytes) if proto.SerializeToString() != alt_proto.SerializeToString(): # OK, let's try to figure out what happened. diff --git a/test/test_autograd.py b/test/test_autograd.py index 971f4a03bdbb..ddf3c4b54813 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2162,6 +2162,24 @@ def f(a, b): run_functional_checks(self, "test_cdist", "cdist", f, True, f_args_variable, f_args_tensor) + def test_var_mean_differentiable(self): + dim = [2, 4] + keepdim = False + input1 = torch.randn(3, 4, 5, 6, 2, 3, requires_grad=True) + input2 = deepcopy(input1) + var1, mean1 = torch.var_mean(input1, dim=dim, keepdim=keepdim) + var2 = input2.var(dim=dim, keepdim=keepdim) + mean2 = input2.mean(dim=dim, keepdim=keepdim) + grad = torch.randn(3, 4, 6, 3, requires_grad=True) + + r1 = var1 * var1 * mean1 * mean1 + r2 = var2 * var2 * mean2 * mean2 + self.assertTrue(torch.allclose(r1, r2, rtol=0.01, atol=0.0)) + + torch.autograd.backward(r1, grad) + torch.autograd.backward(r2, grad) + self.assertTrue(torch.allclose(input1.grad, input2.grad, rtol=0.01, atol=0.0)) + @skipIfNoLapack def test_cholesky(self): def func(root, upper): @@ -3006,6 +3024,34 @@ def parent_on_cpu(inp): # gpu thread ReadyQueue out.sum().backward() + def test_version_counter(self): + x = torch.randn(1, 2) + + # In-place op bumps version + x_saved_version = x._version + x.add_(1).add_(1) + self.assertTrue(x._version > x_saved_version) + + # Differentiable view shares version counter + xz = x[:] + self.assertTrue(x._version == xz._version) + xz.add_(1) + self.assertTrue(x._version == xz._version) + + # `x.data = y` preserves version counter of `x` + x_saved_version = x._version + x.data = torch.randn(2, 3) + self.assertTrue(x._version == x_saved_version) + x.add_(1) + self.assertTrue(x._version > x_saved_version) + # Make sure `x` is still using the same version counter it shares with `xz` + self.assertTrue(x._version == xz._version) + + # In-place op on `xz` also updates version of `x`, + # because they share the version counter + xz.add_(1) + self.assertTrue(x._version == xz._version) + def index_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/test/test_c10d.py b/test/test_c10d.py index e21eb211ab82..0af1979099b6 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -579,6 +579,14 @@ def opts(self, threads=2): opts.threads = threads return opts + def test_empty_tensors(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + xs = [torch.FloatTensor([])] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + def test_broadcast_checks(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) @@ -1344,6 +1352,30 @@ def setUp(self): def tearDown(self): pass + def test_empty_tensors(self): + store = c10d.FileStore(self.file.name, self.world_size) + pg = 
c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + + xs = [torch.cuda.FloatTensor([])] + pg.broadcast(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.allreduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + pg.reduce(xs).wait() + self.assertEqual(0, xs[0].numel()) + + ys = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] + pg.allgather(ys, xs).wait() + for y in ys[0]: + self.assertEqual(0, y.numel()) + + ys = [torch.cuda.FloatTensor([])] + xs = [[torch.cuda.FloatTensor([]) for _ in range(self.world_size)]] + pg.reduce_scatter(ys, xs).wait() + self.assertEqual(0, ys[0].numel()) + def test_broadcast_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1646,52 +1678,60 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, gpus, global_batch_size): + def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): model = Net() ddp_model = DistributedDataParallel( - copy.deepcopy(model).cuda(gpus[0]), - device_ids=gpus, + copy.deepcopy(model).to(devices[0]), + device_ids=device_ids, process_group=process_group, bucket_cap_mb=0.001) - model.cuda(gpus[0]) + model.to(devices[0]) - input = torch.randn(global_batch_size, 2).cuda(gpus[0]) - target = torch.randn(global_batch_size, 4).cuda(gpus[0]) + input = torch.randn(global_batch_size, 2).to(devices[0]) + target = torch.randn(global_batch_size, 4).to(devices[0]) return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, gpus, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): self.assertTrue( - len(gpus) == 2 or len(gpus) == 4, - "unexpected devices for ddp tests {}".format(gpus)) - if len(gpus) == 2: - model = DoubleGpuNet(gpus) - elif len(gpus) == 4: - model = QuadraGpuNet(gpus) + len(devices) == 2 or len(devices) == 4, + "unexpected devices for ddp tests {}".format(devices)) + if len(devices) == 2: + model = DoubleGpuNet(devices) + elif len(devices) == 4: + model = QuadraGpuNet(devices) ddp_model = DistributedDataParallel( copy.deepcopy(model), + device_ids=device_ids, process_group=process_group, bucket_cap_mb=0.001) - input = torch.randn(global_batch_size, 2).to(gpus[0]) + input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, gpus, multi_gpu=False): - local_batch_size = len(gpus) + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + """ + Note: we pass down `device_ids` all the way to DistributedDataParallel + as part of the test. Below you find tests that either use a list of + integers, a list of `torch.Device` instances, or an empty list. + The `devices` argument is used to control placement of the model and + must always be specified as list of `torch.Device` instances. 
+ """ + local_batch_size = len(devices) global_batch_size = self.world_size * local_batch_size - if multi_gpu: + if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, gpus, global_batch_size) + process_group, devices, device_ids, global_batch_size) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, gpus, global_batch_size) + process_group, devices, device_ids, global_batch_size) def step_model(model, input, target): model.train() @@ -1725,87 +1765,72 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, gpus, multi_gpu=False, use_str=False): - if use_str: - gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + def _test_gloo_backend(self, devices, device_ids, multi_device=False): store = c10d.FileStore(self.file.name, self.world_size) options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, gpus, multi_gpu) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + + def test_gloo_backend_cpu_module(self): + self._test_gloo_backend([torch.device('cpu')], []) @skip_if_not_multigpu - def test_gloo_backend(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus) + def test_gloo_backend_1gpu_module_device_ids_integer_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, int_devices) @skip_if_not_multigpu - def test_gloo_backend_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus, use_str=True) + def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, devices) @skip_if_lt_x_gpu(4) def test_gloo_backend_2gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:2], multi_gpu=True) - - @skip_if_lt_x_gpu(4) - def test_gloo_backend_2gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:2], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, [], multi_device=True) @skip_if_lt_x_gpu(8) def test_gloo_backend_4gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:4], multi_gpu=True) - - @skip_if_lt_x_gpu(8) - def test_gloo_backend_4gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_gloo_backend(gpus[:4], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:4] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, gpus, multi_gpu=False, use_str=False): - if use_str: - gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + def _test_nccl_backend(self, devices, device_ids, multi_device=False): store = c10d.FileStore(self.file.name, 
self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, gpus, multi_gpu) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) @skip_if_not_multigpu @skip_if_not_nccl - def test_nccl_backend(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus) + def test_nccl_backend_1gpu_module_device_ids_integer_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, int_devices) @skip_if_not_multigpu @skip_if_not_nccl - def test_nccl_backend_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus, use_str=True) + def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, devices) @skip_if_lt_x_gpu(4) @skip_if_not_nccl def test_nccl_backend_2gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:2], multi_gpu=True) - - @skip_if_lt_x_gpu(4) - @skip_if_not_nccl - def test_nccl_backend_2gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:2], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:2] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, [], multi_device=True) @skip_if_lt_x_gpu(8) @skip_if_not_nccl def test_nccl_backend_4gpu_module(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:4], multi_gpu=True) - - @skip_if_lt_x_gpu(8) - @skip_if_not_nccl - def test_nccl_backend_4gpu_module_str(self): - gpus = gpus_for_rank(self.world_size)[self.rank] - self._test_nccl_backend(gpus[:4], multi_gpu=True, use_str=True) + int_devices = gpus_for_rank(self.world_size)[self.rank][:4] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) + self._test_nccl_backend(devices, [], multi_device=True) @skip_if_lt_x_gpu(4) @skip_if_not_nccl @@ -2373,16 +2398,15 @@ def check_no_grads(): @skip_if_not_multigpu @skip_if_not_nccl def test_accumulate_gradients(self): - gpus = gpus_for_rank(self.world_size)[self.rank][0:1] - self.assertEqual(len(gpus), 1) + int_devices = gpus_for_rank(self.world_size)[self.rank][:1] + devices = list([torch.device('cuda:' + str(i)) for i in int_devices]) store = c10d.FileStore(self.file.name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - local_batch_size = len(gpus) - global_batch_size = self.world_size * local_batch_size + global_batch_size = self.world_size model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, gpus, global_batch_size) + process_group, devices, devices, global_batch_size) def step_model(model, input, target): model.train() @@ -2395,25 +2419,25 @@ def step_model(model, input, target): ddp_model.train() ddp_model.module(input) - # check two model parameters over 2 iterations + # Check two model parameters over 4 iterations. + # Use 4 iterations because we alternate between reducing and + # not reducing and want to make sure we switch both ways. 
for iteration in range(4): - # single cpu/gpu training step_model(model, input, target) if iteration % 2 == 0: # Skip gradients sync without calling prepare_for_backward - step_model(ddp_model.module, - input[self.rank * local_batch_size: (self.rank + 1) * local_batch_size], - target[self.rank * local_batch_size: (self.rank + 1) * local_batch_size]) - + step_model( + ddp_model.module, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)]) for i, j in zip(model.parameters(), ddp_model.parameters()): self.assertNotEqual(i.grad, j.grad) else: - # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs - step_model(ddp_model, - input[self.rank * local_batch_size: (self.rank + 1) * local_batch_size], - target[self.rank * local_batch_size: (self.rank + 1) * local_batch_size]) - + step_model( + ddp_model, + input[self.rank : (self.rank + 1)], + target[self.rank : (self.rank + 1)]) for i, j in zip(model.parameters(), ddp_model.parameters()): self.assertEqual(i.grad, j.grad) @@ -2646,6 +2670,72 @@ def test_multi_limit_multi_dtype(self): self.assertEqual([[0], [1], [2, 4], [3, 5]], result) +class CommTest(MultiProcessTestCase): + def tearDown(self): + super(CommTest, self).tearDown() + try: + os.remove(self.file.name) + except OSError: + pass + + @property + def world_size(self): + return 2 + + def _test_broadcast_coalesced(self, process_group, device): + half = torch.float16 + + # No support for float16 for CPU tensors + if device == torch.device('cpu'): + half = torch.float32 + + target = torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float64, device=device).chunk(5) + target += torch.arange(60, dtype=half, device=device).chunk(5) + target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) + + # The tensors to pass to broadcast are idential to the target + # only on the process that is the root of the broadcast. 
+ if self.rank == 0: + tensors = list(tensor.clone() for tensor in target) + else: + tensors = list(torch.empty_like(tensor) for tensor in target) + + c10d._broadcast_coalesced( + process_group, + tensors, + buffer_size=256) + + self.assertEqual(tensors, target) + + @skip_if_not_multigpu + @skip_if_not_nccl + def test_broadcast_coalesced_nccl(self): + store = c10d.FileStore(self.file.name, self.world_size) + process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + device = torch.device('cuda:%d' % self.rank) + self._test_broadcast_coalesced(process_group, device) + + @skip_if_not_multigpu + def test_broadcast_coalesced_gloo_cuda(self): + store = c10d.FileStore(self.file.name, self.world_size) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + device = torch.device('cuda:%d' % self.rank) + self._test_broadcast_coalesced(process_group, device) + + def test_broadcast_coalesced_gloo_cpu(self): + store = c10d.FileStore(self.file.name, self.world_size) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + device = torch.device('cpu') + self._test_broadcast_coalesced(process_group, device) + + if __name__ == '__main__': assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process" diff --git a/test/test_cuda.py b/test/test_cuda.py index 5afcfd1b74bf..2a400e47bd72 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -974,15 +974,47 @@ def test_copy_streams(self): self._test_copy_sync_current_stream(x0, x2) def test_copy_non_blocking(self): - x = torch.randn(5, 5).cuda() - y = torch.zeros(5, 5) - y.copy_(x, non_blocking=True) - self.assertEqual(x, y) + def _test_copy_non_blocking(a, b): + event = torch.cuda.Event() + a.copy_(b, non_blocking=True) + event.record() + self.assertFalse(event.query()) + event.synchronize() + self.assertEqual(a, b) - x = torch.randn(5, 5) - y = torch.zeros(5, 5).cuda() - y.copy_(x, non_blocking=True) - self.assertEqual(x, y) + # 10MB copies + x = torch.ones(10000000, dtype=torch.uint8).cuda() + y = torch.zeros(10000000, dtype=torch.uint8).pin_memory() + _test_copy_non_blocking(x, y) + + x = torch.zeros(10000000, dtype=torch.uint8).pin_memory() + y = torch.ones(10000000, dtype=torch.uint8).cuda() + _test_copy_non_blocking(x, y) + + def test_copy_broadcast(self): + x = torch.randn(10, 5) + y = torch.randn(5, device='cuda') + x.copy_(y) + self.assertEqual(x[3], y.cpu()) + + x = torch.randn(10, 5, device='cuda') + y = torch.randn(5) + x.copy_(y) + self.assertEqual(x[3].cpu(), y) + + def test_copy_noncontig(self): + def do_test(d0, d1): + x = torch.tensor([1.5, 2.5, 3.5, 4.5, 5.5, 6.5], device=d0) + y = torch.tensor([0, 0, 0, 0, 0, 0], device=d1) + self.assertNotEqual(x.dtype, y.dtype) + + y[::2].copy_(x[::2]) + self.assertEqual(y, [1, 0, 3, 0, 5, 0]) + + do_test('cpu', 'cuda') + do_test('cuda', 'cpu') + if TEST_MULTIGPU: + do_test('cuda:0', 'cuda:1') def test_serialization_array_with_storage(self): x = torch.randn(5, 5).cuda() @@ -2720,9 +2752,6 @@ def test_bincount_cuda(self): self.assertEqual(t.cpu().bincount(), t.bincount()) self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) - def test_histc_cuda(self): - _TestTorchMixin._test_histc(self, device='cuda') - def test_tiny_half_norm_(self): a = 
torch.arange(25).cuda().float() a /= 100000000 diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 825689ffced6..2cf3c4b5c7a1 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -454,6 +454,10 @@ def kill_pid(pid): def init_fn(worker_id): torch.manual_seed(12345) +# used with test_error_in_init +def error_worker_init_fn(_): + raise RuntimeError("Error in worker_init_fn") + class TestDataLoader(TestCase): @@ -509,6 +513,11 @@ def fn(): self.assertRaises(ValueError, fn) + def test_error_in_init(self): + loader = DataLoader(self.dataset, num_workers=2, worker_init_fn=error_worker_init_fn) + with self.assertRaisesRegex(RuntimeError, 'Error in worker_init_fn'): + list(iter(loader)) + def test_sequential(self): self._test_sequential(DataLoader(self.dataset)) diff --git a/test/test_distributions.py b/test/test_distributions.py index c35dec77ef12..3f74918d0d8a 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -953,6 +953,18 @@ def ref_log_prob(idx, x, log_prob): logits = probs_to_logits(probs, is_binary=True) self._check_log_prob(Binomial(total_count, logits=logits), ref_log_prob) + def test_binomial_stable(self): + logits = torch.tensor([-100., 100.], dtype=torch.float) + total_count = 1. + x = torch.tensor([0., 0.], dtype=torch.float) + log_prob = Binomial(total_count, logits=logits).log_prob(x) + self.assertTrue(torch.isfinite(log_prob).all()) + + # make sure that the grad at logits=0, value=0 is 0.5 + x = torch.tensor(0., requires_grad=True) + y = Binomial(total_count, logits=x).log_prob(torch.tensor(0.)) + self.assertEqual(grad(y, x)[0], torch.tensor(-0.5)) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_binomial_log_prob_vectorized_count(self): probs = torch.tensor([0.2, 0.7, 0.9]) diff --git a/test/test_fake_quant.py b/test/test_fake_quant.py index b8d8dbb5dba1..7c39ee2b9b8c 100644 --- a/test/test_fake_quant.py +++ b/test/test_fake_quant.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch +import torch.cuda import torch.jit import numpy as np import unittest @@ -66,6 +67,9 @@ def test_backward(self): np.testing.assert_allclose(dX, dX_prime, rtol=tolerance, atol=tolerance) def test_numerical_consistency(self): + ''' + Comparing numerical consistency between CPU quantize/dequantize op and the CPU fake quantize op + ''' np.random.seed(NP_RANDOM_SEED) fake_quantize_per_tensor_affine_forward = torch.ops.quantized.fake_quantize_per_tensor_affine_forward @@ -74,13 +78,72 @@ def test_numerical_consistency(self): num_bits = 8 X = np.random.rand(20, 20) * 125 X_torch = torch.from_numpy(X).float() - Y = X_torch.quantize_linear(scale, zero_point).dequantize() + Y = torch.dequantize(torch.quantize_linear(X_torch, scale, zero_point, torch.qint8)) Y_prime = fake_quantize_per_tensor_affine_forward( X=X_torch, scale=scale, zero_point=zero_point, num_bits=num_bits, quant_delay=0, iter=0) tolerance = 1e-6 np.testing.assert_allclose(Y, Y_prime, rtol=tolerance, atol=tolerance) + """Tests the forward path of the FakeQuantizePerTensorAffine CUDA op.""" + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_forward_cuda(self): + np.random.seed(NP_RANDOM_SEED) + fake_quantize_per_tensor_affine_forward = torch.ops.quantized.fake_quantize_per_tensor_affine_forward + + scale = 3 + zero_point = 2 + num_bits = 8 + X = np.random.rand(20, 20) * 125 + X_torch = torch.from_numpy(X).float().cuda() + Y = _fake_quantize_per_tensor_affine_reference(X, scale, zero_point, 
num_bits) + Y_prime = fake_quantize_per_tensor_affine_forward( + X=X_torch, scale=scale, zero_point=zero_point, num_bits=num_bits, + quant_delay=0, iter=0) + tolerance = 1e-6 + np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) + + """Tests the backward method. Note that this runs the reference quantization + and thus the errors might be originating there.""" + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_backward_cuda(self): + np.random.seed(NP_RANDOM_SEED) + fake_quantize_per_tensor_affine_backward = torch.ops.quantized.fake_quantize_per_tensor_affine_backward + + scale = 3 + zero_point = 2 + num_bits = 8 + X = np.random.rand(20, 20) * 125 + Y = _fake_quantize_per_tensor_affine_reference(X, scale, zero_point, num_bits) + dY = Y - X # Fake gradient + dX = _fake_quantize_per_tensor_affine_grad_reference(X, dY, scale, zero_point, num_bits) + X_torch = torch.from_numpy(X).float().cuda() + dY_torch = torch.from_numpy(dY).float().cuda() + dX_prime = fake_quantize_per_tensor_affine_backward( + X=X_torch, dY=dY_torch, scale=scale, zero_point=zero_point, + num_bits=num_bits, quant_delay=0, iter=0) + tolerance = 1e-6 + np.testing.assert_allclose(dX, dX_prime.cpu(), rtol=tolerance, atol=tolerance) + + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_numerical_consistency_cuda(self): + ''' + Comparing numerical consistency between CPU quantize/dequantize op and the CUDA fake quantize op + ''' + np.random.seed(NP_RANDOM_SEED) + fake_quantize_per_tensor_affine_forward = torch.ops.quantized.fake_quantize_per_tensor_affine_forward + + scale = 3 + zero_point = 2 + num_bits = 8 + X = np.random.rand(20, 20) * 125 + X_torch = torch.from_numpy(X).float() + Y = torch.dequantize(torch.quantize_linear(X_torch, scale, zero_point, torch.qint8)) + Y_prime = fake_quantize_per_tensor_affine_forward( + X=X_torch.cuda(), scale=scale, zero_point=zero_point, num_bits=num_bits, + quant_delay=0, iter=0) + tolerance = 1e-6 + np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index 7ce71658f287..bbf826bd1668 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -17,7 +17,7 @@ from torch._six import inf, PY2, builtins, StringIO from common_utils import TestCase, run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ skipIfRocm, skipIfNoLapack, suppress_warnings, load_tests, IS_SANDCASTLE, \ - freeze_rng_state, set_rng_seed, slowTest + freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName from common_nn import module_tests, new_module_tests, criterion_tests from textwrap import dedent from functools import wraps, reduce @@ -75,9 +75,7 @@ CUDA_VERSION = torch._C._cuda_getCompiledVersion() for d in range(torch.cuda.device_count()): major = torch.cuda.get_device_capability(d)[0] - if (CUDA_VERSION < 8000 and major >= 6) or (CUDA_VERSION < 9000 and major >= 7): - RUN_CUDA = False - if (CUDA_VERSION < 9000 or major < 6): + if (major < 6): RUN_CUDA_HALF = False RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1 @@ -86,25 +84,6 @@ WINDOWS = sys.platform == 'win32' -if WINDOWS: - @contextmanager - def TemporaryFileName(): - # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile - # opens the file, and it cannot be opened multiple times in Windows. 
To support Windows, - # close the file after creation and try to remove it manually - f = tempfile.NamedTemporaryFile(delete=False) - try: - f.close() - yield f.name - finally: - os.unlink(f.name) -else: - @contextmanager # noqa: T484 - def TemporaryFileName(): - with tempfile.NamedTemporaryFile() as f: - yield f.name - - def LSTMCellF(input, hx, cx, *params): return LSTMCell(input, (hx, cx), *params) @@ -267,6 +246,11 @@ def wrapper(*args, **kwargs): return wrapper return noop_fuser +@contextmanager +def enable_profiling_mode(): + torch._C._jit_set_profiling_mode(True) + yield + torch._C._jit_set_profiling_mode(False) # note: not re-entrant, use unnested only @contextmanager @@ -1316,15 +1300,15 @@ def forward(self, x): x = F.relu(self.conv1(x)) return x - trace = testModule() + scriptM = testModule() # Constant Propagation step is performed because this pass is intended # to insert quant-dequant nodes for quantizable tensors. The type analysis # happens as part of this jit pass - torch._C._jit_pass_constant_propagation(trace.graph) + torch._C._jit_pass_constant_propagation(scriptM.graph) # TODO: Build the qparam_dict from parse_ir directly for this pass - qparam_dict = _helper_generate_qparam(trace, input_data) - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + qparam_dict = _helper_generate_qparam(scriptM, input_data) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # both conv and relu nodes and at external output since relu @@ -1332,14 +1316,11 @@ def forward(self, x): # quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ .check_next("dequantize_linear") \ - .check("conv2d").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("conv2d").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .run(str(trace.graph)) - FileCheck().check("relu").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("relu").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .check_next("return").run(str(trace.graph)) + .check_next("return").run(str(scriptM.graph)) def test_insert_quantdequant_consecutive_qnodes_trace(self): input_data = torch.ones([1, 1, 5, 5]) @@ -1353,12 +1334,12 @@ def forward(self, x): x = F.relu(self.conv1(x)) return x - trace = torch.jit.trace(testModule(), (input_data)) + scriptM = torch.jit.trace(testModule(), (input_data)) - qparam_dict = _helper_generate_qparam(trace, input_data) + qparam_dict = _helper_generate_qparam(scriptM, input_data) if not len(qparam_dict): return - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # both conv and relu nodes and at external output since relu @@ -1366,14 +1347,11 @@ def forward(self, x): # quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ .check_next("dequantize_linear") \ - .check("_convolution").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("_convolution").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .run(str(trace.graph)) - FileCheck().check("relu").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("relu").check("quantize_linear") \ 
.check_next("int_repr").check_next("dequantize_linear") \ - .check_next("return").run(str(trace.graph)) + .check_next("return").run(str(scriptM.graph)) def test_insert_quantdequant_single_qnode(self): input_data = torch.ones([1, 1, 5, 5]) @@ -1389,26 +1367,24 @@ def forward(self, x): x1 = torch.add(x, 1) return x1 - trace = testModule() + scriptM = testModule() # Constant Propagation step is performed because this pass is intended # to insert quant-dequant nodes for quantizable tensors. The type analysis # happens as part of this jit pass - torch._C._jit_pass_constant_propagation(trace.graph) + torch._C._jit_pass_constant_propagation(scriptM.graph) - qparam_dict = _helper_generate_qparam(trace, input_data) - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + qparam_dict = _helper_generate_qparam(scriptM, input_data) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # both conv and no quant-dequant after add. Constant nodes correspond # to params for the quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ .check_next("dequantize_linear") \ - .check("conv2d").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ + .check("conv2d").check("quantize_linear") \ .check_next("int_repr").check_next("dequantize_linear") \ - .check_next("add").check_next("return") \ - .run(str(trace.graph)) + .check_next("add").check_next("return").run(str(scriptM.graph)) def test_insert_quantdequant_alternate_qnode(self): input_data = torch.ones([1, 1, 5, 5]) @@ -1425,28 +1401,105 @@ def forward(self, x): x2 = F.relu(x1) return x2 - trace = testModule() + scriptM = testModule() # Constant Propagation step is performed because this pass is intended # to insert quant-dequant nodes for quantizable tensors. The type analysis # happens as part of this jit pass - torch._C._jit_pass_constant_propagation(trace.graph) + torch._C._jit_pass_constant_propagation(scriptM.graph) - qparam_dict = _helper_generate_qparam(trace, input_data) - torch._C._jit_pass_insert_quantdequant(trace.graph, qparam_dict) + qparam_dict = _helper_generate_qparam(scriptM, input_data) + torch._C._jit_pass_insert_quantdequant(scriptM.graph, qparam_dict) # We expect to see quant-dequant node before and after # conv, relu and add. 
Constant nodes correspond to params for the # quantization nodes FileCheck().check("quantize_linear").check_next("int_repr") \ - .check_next("dequantize_linear") \ - .check("conv2d").check_next("Constant") \ - .check_next("Constant").check_next("quantize_linear") \ - .check_next("int_repr").run(str(trace.graph)) - FileCheck().check("add").check_next("Constant")\ - .check_next("Constant").check_next("quantize_linear") \ + .check_next("dequantize_linear").check("conv2d") \ + .check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear").run(str(scriptM.graph)) + FileCheck().check("add").check("quantize_linear") \ .check_next("int_repr").check("dequantize_linear") \ - .run(str(trace.graph)) + .run(str(scriptM.graph)) + + def test_insert_quantdequant_for_weight(self): + input_data = torch.ones([1, 1, 1, 1]) + + class testModule(torch.jit.ScriptModule): + def __init__(self): + super(testModule, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1, 1) + + @torch.jit.script_method + def forward(self, x): + x = self.conv1(x) + return x + + def getQParamFunc(value): + scale = 0.5 + zero_point = 1 + return 'per_tensor_quant', scale, zero_point + + scriptModule = testModule() + + # Constant Propagation step is performed because this pass is intended + # to insert quant-dequant nodes for quantizable tensors. The type analysis + # happens as part of this jit pass + torch._C._jit_pass_constant_propagation(scriptModule.graph) + torch._C._jit_pass_insert_quantdequant_for_weight_bias(scriptModule._c, + "forward", + "weight", + getQParamFunc) + + # We expect to see quant-dequant node before conv node for weight. + FileCheck().check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear") \ + .check("conv2d").run(str(scriptModule.graph)) + + def test_insert_quantdequant_for_bias(self): + # Inserting quant-dequant nodes for bias requires scale info present for + # activation and weight so q-dq pass done first for these inputs. + + class testModule(torch.jit.ScriptModule): + def __init__(self): + super(testModule, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1, 1).float() + + @torch.jit.script_method + def forward(self, x): + x = x.quantize_linear(1.0, 0, torch.uint8) + x = x.int_repr() + x = x.dequantize_linear(1.0, 0, torch.uint8) + x = self.conv1(x) + return x + + def getQParamFuncW(value): + return 'per_tensor_quant', 0.5, 1 + + def getQParamFunc(input_scale, weight_scale): + scale = 1 / input_scale / weight_scale + zero_point = 0 + return 'per_tensor_quant', scale, zero_point + + scriptModule = testModule() + + torch._C._jit_pass_constant_propagation(scriptModule.graph) + torch._C._jit_pass_insert_quantdequant_for_weight_bias(scriptModule._c, + "forward", + "weight", + getQParamFuncW) + torch._C._jit_pass_insert_quantdequant_for_weight_bias(scriptModule._c, + "forward", + "bias", + getQParamFunc) + # We expect to see 3 pairs of quant-dequant nodes. 
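# (One pair comes from the explicit quantize_linear/int_repr/dequantize_linear on the
#  activation in forward() above, and the other two are inserted by the weight and bias
#  passes, which is why the FileCheck below looks for three such triplets before conv2d.)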
+ + FileCheck().check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear").check("quantize_linear") \ + .check_next("int_repr").check_next("dequantize_linear") \ + .check("quantize_linear").check_next("int_repr") \ + .check_next("dequantize_linear").check("conv2d") \ + .run(str(scriptModule.graph)) def test_pattern_based_rewrite(self): # mul(mul(mul(mul(x,y),z),x),y) --> mul(mul(mulmul(x,y,z), x), y) --> @@ -2138,7 +2191,7 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( (scores), (bbox_deltas), (im_info), (anchors), - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, + 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, ) return a, b model = MyModel() @@ -3109,6 +3162,21 @@ def fn(x): warns = [str(w.message) for w in warns] self.assertEqual(len(warns), 0) + @unittest.skipIf(sys.platform == "win32", "temp file name on windows") + def test_trace_save(self): + def fn(x): + return x + 2 + + def check(func): + with tempfile.NamedTemporaryFile() as f: + func.save(f.name) + loaded = torch.jit.load(f.name) + input = torch.randn(2, 2) + self.assertEqual(func(input), loaded(input)) + + out = torch.jit.trace(fn, (torch.ones(2, 2),)) + check(out) + @unittest.skipIf(sys.platform == "win32", "TODO: need to fix this test case for Windows") def test_torch_load_error(self): class J(torch.jit.ScriptModule): @@ -3266,6 +3334,45 @@ def foo(x): else: cu.define(full) + def test_inherit_method(self): + class A(torch.jit.ScriptModule): + def __init__(self): + super(A, self).__init__() + + @torch.jit.script_method + def forward(self, x): + return x + self.bar(x) + + class B(A): + def __init__(self): + super(B, self).__init__() + + @torch.jit.script_method + def bar(self, x): + return x * x + + with self.assertRaisesRegex(RuntimeError, 'attribute'): + A() # cannot use because bar is not defined + + v = torch.rand(3, 4) + b = B() + self.assertEqual(b(v), v + v * v) + + class C(torch.jit.ScriptModule): + def __init__(self): + super(C, self).__init__() + + @torch.jit.script_method + def bar(self, x): + return x + + class D(C, B): + def __init__(self): + super(D, self).__init__() + + self.assertEqual(D()(v), v + v) + + def test_tracing_multiple_methods(self): class Net(nn.Module): def __init__(self): @@ -3291,6 +3398,11 @@ def weighted_kernel_sum(self, weight): check_inputs.append({'forward' : check_forward_input, 'weighted_kernel_sum' : check_weight}) module = torch.jit.trace_module(n, inputs, True, True, check_inputs) + module = torch.jit.trace(n.forward, example_forward_input) + module = torch.jit.trace(n.forward, example_forward_input, True, True, [example_forward_input]) + with self.assertRaisesRegex(AttributeError, "trace doesn't support compiling individual module's functions"): + module = torch.jit.trace(n.weighted_kernel_sum, inputs) + def test_submodule_twice(self): @torch.jit.script def foo(x): @@ -3742,12 +3854,17 @@ def func(a, b): def func2(a, b, c, d): return c + a ** b ** d + def func3(a, b): + # type: (int, float) -> float + return a ** b + a = torch.rand(1, requires_grad=True) b = torch.rand(1, requires_grad=True) c = torch.rand(1, requires_grad=True) d = torch.rand(1, requires_grad=True) self.checkScript(func, (a, b), optimize=True) self.checkScript(func2, (a, b, c, d), optimize=True) + self.checkScript(func3, (4, -0.5), optimize=True) @unittest.skipIf(not RUN_CUDA, "device tests require CUDA") def test_pow_scalar_backward_cuda(self): @@ -4338,7 +4455,7 @@ def reassign_from_empty_literal(): if True: x = 
[1, 2, 3] return - with self.assertRaisesRegex(RuntimeError, r"previously has type Tensor\[\]"): + with self.assertRaisesRegex(RuntimeError, r"previously has type List\[Tensor\]"): self.checkScript(reassign_from_empty_literal, (), optimize=False) def reassign_from_empty_builtin(): @@ -5006,6 +5123,31 @@ def func(alpha, beta, x, y): # NOTE: cannot optimize yet because broadcasts are not inserted before the fuser runs self.checkScript(script, [alpha, beta, x, y], optimize=False, outputs=outputs) + def test_profiling_graph_executor(self): + @torch.jit.script + def basic(x, y): + a = x + y + b = x * y + c = x + 1 + d = a - c + e = b - c + return d + e + + a = torch.rand(2, 3) + b = torch.rand(2, 3) + + with enable_profiling_mode(): + basic(a, b) + basic(a, b) + basic(a, b) + + # this tests that a profiling count is being decrement by + # a profile instruction. + # this is the easiest way to test that a graph was instrumented + # from python + with self.assertRaisesRegex(RuntimeError, "Not yet implemented"): + basic(a, b) + def test_resize_input_ops(self): # resize_ and resize_as resize the input tensor. because our shape analysis # is flow invariant, we set any Tensor that can alias a resized Tensor @@ -5510,7 +5652,7 @@ def test_not_cast(x): self.checkScript(test_not_cast, (torch.tensor(1),)) self.checkScript(test_not_cast, (torch.tensor(0),)) - with self.assertRaisesRegex(RuntimeError, "expected"): + with self.assertRaisesRegex(RuntimeError, "Could not cast value of type Tuple\[Tensor, Tensor\]"): # noqa: W605 @torch.jit.script def test_mult(x, y): return not(x, y) @@ -5535,7 +5677,7 @@ def test_cast_float(x): self.checkScript(test_cast_float, (0.,)) self.checkScript(test_cast_float, (-1.,)) - with self.assertRaisesRegex(RuntimeError, "expected a bool, int, float, or Tensor"): + with self.assertRaisesRegex(RuntimeError, "Could not cast value of type Tuple\[int, int\] to bool"): # noqa: W605 @torch.jit.script def test_bad_conditional(x): if (1, 2): @@ -5860,6 +6002,61 @@ def test_pow_int(x, y): self.checkScript(test_pow_float, (2.0, 2.0)) self.checkScript(test_pow_int, (2.0, 2)) + @unittest.skipIf(PY2, "Requires python 3") + def test_math_gcd(self): + def test_gcd(x, y): + # type: (int, int) -> int + return math.gcd(x, y) + + for inputs in [(2, 4), (-5, -15), (-5, 15), (10, 0), (0, 10), (-5, 0), (0, -5), (0, 0), (0, -0)]: + self.checkScript(test_gcd, inputs) + + def test_math_ops1(self): + funcs_template = dedent(''' + def func(): + return math.{func}({scalar}) + ''') + + def run_test(code): + scope = {} + execWrapper(code, globals(), scope) + cu = torch.jit.CompilationUnit(code) + self.assertEqual(cu.func(), scope['func']()) + + special_domain = ['gamma', 'lgamma'] + + for func in ['erf', 'erfc', 'expm1', 'fabs', 'gamma', 'lgamma']: + for scalar in [1, 10, 0, -1, -1.5, 5.0, 1.5]: + if func in special_domain and scalar in [0, -1]: + continue + code = funcs_template.format(func=func, scalar=scalar) + run_test(code) + + def test_math_copysign(self): + + def func1(x, y): + # type: (int, int) -> float + return math.copysign(x, y) + + def func2(x, y): + # type: (int, float) -> float + return math.copysign(x, y) + + def func3(x, y): + # type: (float, int) -> float + return math.copysign(x, y) + + def func4(x, y): + # type: (float, float) -> float + return math.copysign(x, y) + + inputs = [(3.3, 5.5), (3.3, -5.5), (-3.3, 5.5), (-3.3, -5.5), (3.3, 0.0), (0.0, 3.3)] + for a, b in inputs: + self.checkScript(func1, (int(a), int(b))) + self.checkScript(func2, (int(a), b)) + self.checkScript(func3, (a, 
int(b))) + self.checkScript(func4, (a, b)) + def test_if_nest_while(self): def func(a, b): # type: (int, int) -> int @@ -8089,7 +8286,7 @@ def foo(i): v = torch.rand(10, 3) self.checkScript(foo, (v,)) - with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type \(Tensor, Tensor\)"): + with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type Tuple"): @torch.jit.script def mixtypes(x): a = (x, x) @@ -8347,7 +8544,7 @@ def fn(x : torch.Tensor, y : Tensor, z) -> Tuple[Tensor, Tensor, Tensor]: fn = get_fn('test_type_annotation_py3', script_path) with self.assertRaisesRegex(RuntimeError, r"expected a value of type Tensor for argument" - r" '0' but found \(Tensor, Tensor\)"): + r" '0' but found Tuple\[Tensor,"): @torch.jit.script def bad_fn(x): x, y = fn((x, x), x, x) @@ -9285,7 +9482,7 @@ def f3(a): def f4(a): torch.cat(a) - with self.assertRaisesRegex(RuntimeError, r'argument \'tensors\' but found int\[\]'): + with self.assertRaisesRegex(RuntimeError, r'argument \'tensors\' but found List\[int\]'): @torch.jit.script def f5(a): torch.cat([3]) @@ -9342,7 +9539,7 @@ def foo(x, y): self.assertExpected(str(cu.foo.schema)) def test_parser_type_annotations_unknown_type(self): - with self.assertRaisesRegex(RuntimeError, r'Unknown type name Foo'): + with self.assertRaisesRegex(RuntimeError, "Unknown type name 'Foo'"): cu = torch.jit.CompilationUnit(''' def foo(x : Tensor, y : Tuple[Tuple[Foo, Tensor], Tensor]) -> Tuple[Tensor, Tensor]: return x, x @@ -10759,6 +10956,61 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]: test_str.append(str(fn.schema)) self.assertExpected("\n".join(test_str)) + @unittest.skipIf(not PY35, "Python 3.5 needed") + def test_multiline_annot_ast_py3_fn(self): + code = dedent(''' + from typing import Tuple, List, Optional + from torch import Tensor + from torch.jit.annotations import BroadcastingList2, BroadcastingList3 + import torch + @torch.jit.script + def foo(x, # type: {input} + y # type: Tuple[Tensor, Tensor] + ): + # type: (...) -> Tuple[{output}, {output}] + return x, x + ''') + test_str = [] + + for pair in self.type_input_return_pairs(): + fn = self._get_py3_code(self.format_code(code, pair), 'foo') + args = fn.schema.arguments + returns = fn.schema.returns + self.assertEqual(str(args[0].type), pair[1]) + self.assertEqual(str(args[1].type), "Tuple[Tensor, Tensor]") + self.assertEqual(str(returns[0].type), "Tuple[{}, {}]".format(pair[1], pair[1])) + + def test_bad_multiline_annotations(self): + with self.assertRaisesRegex(RuntimeError, "Return type line"): + @torch.jit.script + def bad_type_line(a, # type: Tensor + b, # type: Tensor + c # type: Tensor + ): + # type: (int, int, int) -> Tensor + # type: bad type line # noqa: F723 + + return a + b + c + + with self.assertRaisesRegex(RuntimeError, "Return type line"): + @torch.jit.script + def bad_return_line(a, # type: Tensor + b, + c # type: Tensor + ): + # type: (int, int, int) -> Tensor + return a + b + c + + # TODO: this should be supported but is difficult to parse + with self.assertRaisesRegex(RuntimeError, "Number of type annotations"): + @torch.jit.script + def missing_type(a, # type: Tensor + b, + c # type: Tensor + ): + # type: (...) 
-> Tensor + return a + b + c + # Python AST Frontend , Python 3-style type annotations , Script method @unittest.skipIf(not PY35, "Python 3.5 needed") def test_annot_ast_py3_method(self): @@ -11471,6 +11723,28 @@ def forward(self, x): weak_mod.weight = torch.nn.Parameter(torch.ones(5, 5) * 100) self.assertFalse(strong_mod(inp).allclose(weak_mod(inp))) + def test_weak_module_isinstance(self): + tester = self + + class M(torch.jit.ScriptModule): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(2, 2) + tester.assertTrue(isinstance(self.linear, nn.Linear)) + + m = M() + + def test_weak_module_attributes(self): + tester = self + + class M(torch.jit.ScriptModule): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(2, 2) + tester.assertEqual(self.linear.in_features, 2) + + m = M() + def test_backend_cudnn_enabled(self): # Only test that this compiles @torch.jit.script @@ -11796,6 +12070,30 @@ def fn(x): self.checkScript(fn, ("abcde",)) + def test_str_ops(self): + def test_str_is(s): + # type: (str) -> Tuple[bool, bool, bool, bool, bool, bool] + return s.isupper(), s.islower(), s.isdigit(), s.isspace(), \ + s.isalnum(), s.isalpha() + + def test_str_to(s): + # type: (str) -> Tuple[str, str] + return s.upper(), s.lower() + + inputs = ["", "12a", "!B", "12", "a", "B", "aB", "$12", "B12", "AB ", + " \t", " \n", "\na", "abc"] + + for input in inputs: + self.checkScript(test_str_is, (input,)) + self.checkScript(test_str_to, (input,)) + + def test_str_cmp(a, b): + # type: (str, str) -> Tuple[bool, bool, bool, bool, bool, bool] + return a != b, a == b, a < b, a > b, a <= b, a >= b + + for i in range(len(inputs) - 1): + self.checkScript(test_str_cmp, (inputs[i], inputs[i + 1])) + def test_ord(self): def fn(x): # type: (str) -> int @@ -11811,6 +12109,48 @@ def index_str_to_tensor(s): s = u'\u00a3'.encode('utf8')[:1] self.checkScript(index_str_to_tensor, (s,)) + @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") + def test_get_set_state(self): + class M(torch.jit.ScriptModule): + __constants__ = ['number'] + + def __init__(self, number, submodule=None): + super(M, self).__init__() + self.register_buffer('buffer1', torch.ones(2, 2)) + self.register_buffer('buffer2', torch.ones(2, 2)) + self.number = number + if submodule: + self.submodule = submodule + + @torch.jit.script_method + def __getstate__(self): + # type: () -> Tuple[Tensor, Tensor, int] + return (self.buffer1, self.buffer2, 74) + + @torch.jit.script_method + def __setstate__(self, state): + # type: (Tuple[Tensor, Tensor, int]) -> None + self.buffer1 = state[0] + 10 + self.buffer2 = state[1] + 10 + + with TemporaryFileName() as fname: + m = M(23, submodule=M(99)) + m.save(fname) + loaded = torch.jit.load(fname) + + # Check original module + self.assertEqual(m.buffer1, torch.ones(2, 2)) + self.assertEqual(m.buffer2, torch.ones(2, 2)) + + # Check top level module + self.assertEqual(loaded.buffer1, torch.ones(2, 2) + 10) + self.assertEqual(loaded.buffer2, torch.ones(2, 2) + 10) + + # Check submodule + self.assertEqual(loaded.submodule.buffer1, torch.ones(2, 2) + 10) + self.assertEqual(loaded.submodule.buffer2, torch.ones(2, 2) + 10) + + def test_string_slicing(self): def fn1(x): # type: (str) -> str @@ -12244,22 +12584,35 @@ def foo(x): m = self.createFunctionFromGraph(foo.graph) self.getExportImportCopy(m) + def get_pickle_values(self): + return (('dict', {"I": "am", "a test": "test"}, Dict[str, str]), + ('float', 2.3, float), + ('int', 99, int), 
+ ('bool', False, bool), + ('tuple', (1, 2, 3, 4), Tuple[int, int, int, int]), + ('list', [(1, 2), (3, 4)], List[Tuple[int, int]]), + ('tensor', torch.randn(2, 2), torch.Tensor), + ('int_list', [1, 2, 3, 4], List[int]), + ('tensor_list', [torch.ones(2, 2) + i for i in range(4)], List[torch.Tensor]), + ('bool_list', [True, True, False, True], List[bool]), + ('float_list', [1., 2., 3., 4.], List[float]), + ('str_list', ['hello', 'bye'], List[str]), + ('none', None, Optional[int]),) + def test_attribute_serialization(self): + tester = self + class M(torch.jit.ScriptModule): def __init__(self): super(M, self).__init__() - self.table = torch.jit.Attribute({"I": "am", "a test": "test"}, Dict[str, str]) - self.float = torch.jit.Attribute(2.3, float) - self.int = torch.jit.Attribute(99, int) - self.bool = torch.jit.Attribute(False, bool) - self.tuple = torch.jit.Attribute((1, 2, 3, 4), Tuple[int, int, int, int]) - self.list = torch.jit.Attribute([(1, 2), (3, 4)], List[Tuple[int, int]]) - self.tensor = torch.jit.Attribute(torch.randn(2, 2), torch.Tensor) - self.int_list = torch.jit.Attribute([1, 2, 3, 4], List[int]) + for name, value, the_type in tester.get_pickle_values(): + setattr(self, name, torch.jit.Attribute(value, the_type)) @torch.jit.script_method def forward(self): - return (self.table, self.float, self.int, self.bool, self.tuple, self.list, self.int_list) + return (self.dict, self.float, self.int, self.bool, self.tuple, + self.list, self.int_list, self.tensor_list, self.bool_list, + self.float_list, self.str_list, self.none) m = M() imported_m = self.getExportImportCopy(m) @@ -12277,21 +12630,19 @@ def fn(x): @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") def test_attribute_unpickling(self): tensor = torch.randn(2, 2) + tester = self class M(torch.jit.ScriptModule): def __init__(self): super(M, self).__init__() - self.table = torch.jit.Attribute({"I": "am", "a test": "test"}, Dict[str, str]) - self.float = torch.jit.Attribute(2.3, float) - self.int = torch.jit.Attribute(99, int) - self.tuple = torch.jit.Attribute((1, 2, 3, 4), Tuple[int, int, int, int]) - self.list = torch.jit.Attribute([(1, 2), (3, 4)], List[Tuple[int, int]]) - self.tensor = torch.jit.Attribute(tensor, torch.Tensor) - self.int_list = torch.jit.Attribute([1, 2, 3, 4], List[int]) + for name, value, the_type in tester.get_pickle_values(): + setattr(self, name, torch.jit.Attribute(value, the_type)) @torch.jit.script_method def forward(self): - return (self.table, self.float, self.int, self.tuple, self.list, self.int_list) + return (self.dict, self.float, self.int, self.bool, self.tuple, + self.list, self.int_list, self.tensor_list, self.bool_list, + self.float_list, self.str_list, self.none) with TemporaryFileName() as fname: M().save(fname) @@ -12300,10 +12651,32 @@ def forward(self): pickled_data = archive.read(os.path.join(archive_name, 'attributes.pkl')) out = pickle.load(io.BytesIO(pickled_data)) - self.assertEqual(out[0], {"I": "am", "a test": "test"}) - self.assertEqual(out[1], 2.3) - self.assertEqual(out[2], 99) - self.assertEqual(out[6], [1, 2, 3, 4]) + def is_tensor_value(item): + if isinstance(item, torch.Tensor): + return True + if isinstance(item, list): + return is_tensor_value(item[0]) + return False + + for loaded_item, item in zip(out, self.get_pickle_values()): + if is_tensor_value(item[1]): + continue + self.assertEqual(item[1], loaded_item) + + def test_script_recurse(self): + def a_python_fn(a, b, c): + return a + b + c + + with 
torch.jit._enable_recursive_script(): + @torch.jit.script + def a_script_fn(d, e, f): + return a_python_fn(d, e, f) + + graph = str(a_script_fn.graph) + FileCheck().check("aten::add").run(graph) + FileCheck().check_not("a_python_fn").run(graph) + t = torch.ones(2, 2) + self.assertEqual(a_script_fn(t, t, t), t + t + t) @unittest.skipIf(IS_WINDOWS or IS_SANDCASTLE, "NYI: TemporaryFileName support for Windows or Sandcastle") def test_old_models_bc(self): @@ -12521,6 +12894,13 @@ def test_pickle_checkpoint(self): self._test_pickle_checkpoint('cpu') self._test_pickle_checkpoint_views('cpu') + def test_string_list(self): + def fn(string): + # type: (str) -> List[str] + return list(string) + + self.checkScript(fn, ("abcdefgh",)) + def test_split(self): def split_two(tensor): a, b, c = torch.split(tensor, 2, dim=1) @@ -12529,6 +12909,14 @@ def split_two(tensor): y = torch.randn(3, 6) self.checkScript(split_two, [(x + y)]) + def test_python_op_name(self): + import random + + with self.assertRaisesRegex(RuntimeError, "randint"): + @torch.jit.script + def fn(): + return random.randint() + class MnistNet(nn.Module): def __init__(self): @@ -13820,14 +14208,14 @@ class TestJitGeneratedFunctional(JitTestCase): '', (True, 'aten::_batch_norm_impl_index')), ('instance_norm', (S, S, S), (non_differentiable(torch.zeros(S)), non_differentiable(torch.ones(S))),), ('layer_norm', (S, S, S, S), ([5],), '', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)),), 'with_only_weight', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], None, non_differentiable(torch.rand(S)),), 'with_only_bias', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index'])), ('layer_norm', (S, S, S, S), ([5], non_differentiable(torch.rand(S)), non_differentiable(torch.rand(S))), 'with_weight_and_bias', - (True, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])), + (False, ['aten::contiguous', 'aten::_batch_norm_impl_index', 'aten::addcmul'])), ('group_norm', (S, S, S), (1, torch.rand(5),),), ('local_response_norm', (S, S, S), (2, ),), ('nll_loss', F.log_softmax(torch.randn(3, 5), dim=0), (torch.tensor([1, 0, 4]),), '', (True, 'aten::nll_loss_forward')), @@ -15273,6 +15661,53 @@ def _xor(): # noqa: E306 def test(): return Foo(torch.tensor(1)) + Foo(torch.tensor(1)) + def test_cast_overloads(self): + @torch.jit.script + class Foo(object): + def __init__(self, val): + # type: (float) -> None + self.val = val + + def __int__(self): + return int(self.val) + + def __float__(self): + return self.val + + def __bool__(self): + return bool(self.val) + + def __str__(self): + return str(self.val) + + def test(foo): + # type: (Foo) -> Tuple[int, float, bool] + if foo: + pass + return int(foo), float(foo), bool(foo) + + fn = torch.jit.script(test) + self.assertEqual(fn(Foo(0.5)), test(0.5)) + self.assertEqual(fn(Foo(0.)), test(0.0)) + # str has slightly different formatting + self.assertTrue("0.5" in (str(Foo(0.5)))) + self.assertTrue("0." 
in (str(Foo(0.0)))) + + @torch.jit.script + class BadBool(object): + def __init__(self): + pass + + def __bool__(self): + return (1, 2) + + with self.assertRaisesRegex(RuntimeError, "expected a bool expression for condition"): + @torch.jit.script + def test(): + if BadBool(): + print(1) + pass + def test_init_compiled_first(self): @torch.jit.script # noqa: B903 class Foo(object): diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py index d5b93381eae8..e992caee55c4 100644 --- a/test/test_jit_fuser.py +++ b/test/test_jit_fuser.py @@ -282,6 +282,22 @@ def funcOptMax(a, b): graph = backward_graph(s) self.assertAllFused(graph) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + @skipIfRocm + def test_dropout(self): + def func(x): + x = torch.nn.functional.dropout(x) + return torch.nn.functional.relu(x) + + a = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True) + s = torch.jit.script(func, (a,)) + self.assertAllFused(s.graph_for(a,), except_for={'aten::div', 'prim::Constant'}) + c = s(a) + c.sum().backward() + graph = backward_graph(s) + self.assertAllFused(graph, except_for={'aten::div', 'prim::Constant'}) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @skipIfRocm @@ -478,7 +494,7 @@ def test_norm_decompose(nm, in_opt_graph, not_in_opt_graph, in_fusegraph): # test for layernorm decompose lm = nn.LayerNorm(8) test_norm_decompose(lm, ['aten::batch_norm_stats'], - ['aten::layer_norm('], ['aten::sub', 'aten::mul', 'aten::addcmul']) + ['aten::layer_norm('], ['aten::sub', 'aten::mul', 'aten::add']) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py index ea3ae4228aa9..c82fcaf51795 100644 --- a/test/test_mkldnn.py +++ b/test/test_mkldnn.py @@ -3,8 +3,10 @@ import unittest import torch +import torch.jit from torch.utils import mkldnn as mkldnn_utils -from common_utils import TestCase, run_tests +from common_utils import TestCase, run_tests, TemporaryFileName + from torch.autograd.gradcheck import gradgradcheck, gradcheck @@ -88,7 +90,7 @@ def test_detach(self): def test_repr(self): self.assertTrue("layout=torch._mkldnn" in str(torch.randn((1, 2, 3, 4), - dtype=torch.float, device=torch.device('cpu')).to_mkldnn())) + dtype=torch.float, device=torch.device('cpu')).to_mkldnn())) def test_conv2d(self): for groups in [1, 4]: @@ -109,6 +111,9 @@ def test_conv2d(self): conv2d(x), mkldnn_conv2d(x.to_mkldnn()).to_dense()) + self._test_serialization(mkldnn_conv2d, (x.to_mkldnn(),)) + self._test_tracing(mkldnn_conv2d, (x.to_mkldnn(),)) + def test_relu(self): x = torch.randn((4, 5), dtype=torch.float32) * 10 self.assertEqual(torch.relu(x), torch.relu(x.to_mkldnn()).to_dense()) @@ -172,6 +177,9 @@ def test_batch_norm2d(self): bn(x), mkldnn_bn(x.to_mkldnn()).to_dense()) + self._test_serialization(mkldnn_bn, (x.to_mkldnn(),)) + self._test_tracing(mkldnn_bn, (x.to_mkldnn(),)) + def test_add(self): N = torch.randint(3, 10, (1,)).item() C = torch.randint(3, 100, (1,)).item() @@ -231,12 +239,41 @@ def test_linear(self): x = torch.randn(3, in_features, dtype=torch.float32) * 10 for bias in [True, False]: - linear = torch.nn.Linear(in_features, out_features).float() + linear = torch.nn.Linear(in_features, out_features, bias=bias).float() mkldnn_linear = mkldnn_utils.to_mkldnn(copy.deepcopy(linear)) self.assertEqual( linear(x), 
mkldnn_linear(x.to_mkldnn()).to_dense()) + self._test_serialization(mkldnn_linear, (x.to_mkldnn(),)) + self._test_tracing(mkldnn_linear, (x.to_mkldnn(),)) + + def test_sigmoid(self): + x = torch.randn(4, 5, dtype=torch.float32) * 10 + mkldnn_x = x.to_mkldnn() + self.assertEqual( + torch.sigmoid(x), + torch.sigmoid(mkldnn_x).to_dense(), + ) + # inplace + torch.sigmoid_(x) + torch.sigmoid_(mkldnn_x) + self.assertEqual(x, mkldnn_x.to_dense()) + + def _test_serialization(self, module, inputs): + with TemporaryFileName() as fname: + torch.jit.save(module, fname) + loaded = torch.jit.load(fname) + self.assertEqual( + module(*inputs).to_dense(), + loaded(*inputs).to_dense()) + + def _test_tracing(self, module, inputs): + traced = torch.jit.trace(module, inputs, check_trace=False) + self.assertEqual( + module(*inputs).to_dense(), + traced(*inputs).to_dense()) + if __name__ == '__main__': run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index 9ef82d54b160..5105dabb69d1 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2070,24 +2070,61 @@ def test_embedding_dense_grad(self): def test_embedding_dense_grad_cuda(self): self._test_embedding_dense_grad("cuda") + def test_move_sparse_half_embedding(self): + embedding = nn.Embedding(10, 3, sparse=True) + self.assertEqual(embedding.weight.device.type, 'cpu') + self.assertEqual(embedding.weight.dtype, torch.float64) + embedding.to(torch.float16) + self.assertEqual(embedding.weight.dtype, torch.float16) + self.assertEqual(embedding.embedding_dim, 3) + self.assertEqual(embedding.num_embeddings, 10) + + if torch.cuda.is_available(): + embedding.to('cuda') + self.assertEqual(embedding.weight.device.type, 'cuda') + embedding.to('cpu') + self.assertEqual(embedding.weight.device.type, 'cpu') + def test_embedding_sparse_backward(self): + self._test_embedding_backward() + + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_embedding_sparse_half_backward(self): + # same as test_embedding_sparse_backward above but testing half types in + # cuda. cpu sum not supported for half types. 
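# A minimal standalone sketch (plain CPU PyTorch, default dtypes assumed) of the
# sparse-gradient layout that _test_embedding_backward below asserts on: with
# sparse=True, weight.grad is a sparse tensor whose _indices() are the looked-up rows
# and whose _values() are the per-row gradients (all ones here, since the loss is .sum()).
import torch
import torch.nn as nn

emb = nn.Embedding(10, 3, sparse=True)
emb(torch.tensor([7, 1, 3])).sum().backward()
assert emb.weight.grad._indices().tolist() == [[7, 1, 3]]
assert emb.weight.grad._values().equal(torch.ones(3, 3))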
+ self._test_embedding_backward('cuda', torch.float16) + + def _test_embedding_backward(self, device='cpu', dtype=torch.float64): embedding = nn.Embedding(10, 3, sparse=True) + tensor = torch.tensor([[7, 1, 3]]) + ones = torch.tensor(1.).expand(3, 3) + tensorTwice = tensor.repeat(1, 2) + onesTwice = torch.cat((ones, ones)) + + embedding = embedding.to(dtype=dtype).to(device) + tensor = tensor.to(device) + ones = ones.to(device) + tensorTwice = tensorTwice.to(device) + onesTwice = onesTwice.to(device) + embedding.zero_grad() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - self.assertEqual(embedding.weight.grad._indices(), torch.LongTensor([[7, 1, 3]])) - self.assertEqual(embedding.weight.grad._values(), torch.tensor(1.).expand(3, 3)) + embedding(tensor[0]).sum().backward() + self.assertEqual(embedding.weight.grad._indices(), tensor) + self.assertEqual(embedding.weight.grad._values(), ones) embedding.zero_grad() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - self.assertEqual(embedding.weight.grad._indices(), torch.LongTensor([[7, 1, 3, 7, 1, 3]])) - self.assertEqual(embedding.weight.grad._values(), torch.tensor(1.).expand(6, 3)) + embedding(tensor[0]).sum().backward() + embedding(tensor[0]).sum().backward() + self.assertEqual(embedding.weight.grad._indices(), tensorTwice) + self.assertEqual(embedding.weight.grad._values(), onesTwice) embedding.zero_grad() - embedding(torch.LongTensor([7, 1, 3])).sum().backward() - embedding(torch.LongTensor([8, 1, 3])).sum().backward() - self.assertEqual(embedding.weight.grad._indices(), torch.LongTensor([[7, 1, 3, 8, 1, 3]])) - self.assertEqual(embedding.weight.grad._values(), torch.tensor(1.).expand(6, 3)) + embedding(tensor[0]).sum().backward() + tensor[0, 0] = 8 + embedding(tensor[0]).sum().backward() + tensorTwice[0, 3] = 8 + self.assertEqual(embedding.weight.grad._indices(), tensorTwice) + self.assertEqual(embedding.weight.grad._values(), onesTwice) def test_embedding_padding_idx(self): embedding = nn.Embedding(10, 20, padding_idx=0) @@ -2377,6 +2414,7 @@ def _test_EmbeddingBag_vs_Embedding(self, N, D, B, L, max_norm=None, needed_prec = dtype2prec[dtype] * 2 else: needed_prec = backward_prec + self.assertEqual(es_weight_grad, e.weight.grad, needed_prec) if test_per_sample_weights and trainable_per_sample_weights: @@ -2564,12 +2602,13 @@ def test_contig_wrong_stride_cudnn(self): def test_embedding_bag(self): for dtype in [torch.double, torch.float]: - # TODO: figure out why backward on float breaks - test_backward = dtype is not torch.float - self._test_EmbeddingBag(False, 'sum', False, test_backward=test_backward, dtype=dtype) - self._test_EmbeddingBag(False, 'mean', False, test_backward=test_backward, dtype=dtype) - self._test_EmbeddingBag(False, 'max', False, test_backward=test_backward, dtype=dtype) + self._test_EmbeddingBag(False, 'sum', False, dtype=dtype) + self._test_EmbeddingBag(False, 'mean', False, dtype=dtype) + self._test_EmbeddingBag(False, 'max', False, dtype=dtype) + # TODO: figure out why precision on sparse embeddings isn't the + # same as for dense. 
+ test_backward = dtype is not torch.float self._test_EmbeddingBag(False, 'sum', True, test_backward=test_backward, dtype=dtype) self._test_EmbeddingBag(False, 'mean', True, test_backward=test_backward, dtype=dtype) @@ -2733,10 +2772,11 @@ def test_embedding_bag_cuda(self, dtype=torch.float): self._test_EmbeddingBag(True, 'sum', False, dtype) self._test_EmbeddingBag(True, 'mean', False, dtype) self._test_EmbeddingBag(True, 'max', False, dtype) - if dtype != torch.half: - # torch.cuda.sparse.HalfTensor is not enabled. - self._test_EmbeddingBag(True, 'sum', True, dtype) - self._test_EmbeddingBag(True, 'mean', True, dtype) + + # see 'todo' in test_embedding_bag. + test_backward = dtype is not torch.float16 + self._test_EmbeddingBag(True, 'sum', True, dtype, test_backward=test_backward) + self._test_EmbeddingBag(True, 'mean', True, dtype, test_backward=test_backward) def test_fractional_max_pool2d(self): x = torch.randn(1, 2, 7, 7, requires_grad=True) @@ -3197,29 +3237,36 @@ def verify_reduction_scalars(input, reduction, output): output = m(sigmoid(input), target) verify_reduction_scalars(input, reduction, output) + @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), + "Scipy v1.0 and/or numpy not found") def test_multihead_attention(self): - def _scaled_dot_attn_ref(Q, K, V, dims, unseen_mask=False, src_lengths=None): + def _scaled_dot_attn_ref(Q, K, V, dims, unseen_mask=None, src_lengths=None, + attn_mask=None, add_zero_attn=False): """ Numpy-based reference implementation of scaled dot attention for testing""" + QKT = _batchmatmul( Q, np.transpose(K, axes=[0, 1, 3, 2]) / np.sqrt(dims[3], dtype=np.float32), # divide by sqrt(d_head) ) - if unseen_mask or src_lengths is not None: - b1, b2, s1, s2 = QKT.shape + b1, b2, s1, s2 = QKT.shape + if unseen_mask is not None or src_lengths is not None: # assert s1 == s2 for i in range(b1): for j in range(b2): for m in range(s1): for n in range(s2): - if unseen_mask and n > m: + if unseen_mask[m][n] == 0: QKT[i, j, m, n] = -np.inf if src_lengths is not None and n >= src_lengths[i]: QKT[i, j, m, n] = -np.inf + reference = _softmax(QKT) + ref_attn_weight = reference + ref_attn_weight = np.sum(ref_attn_weight, axis=1) / b2 reference = _batchmatmul(reference, V) - return reference + return reference, ref_attn_weight def _batchmatmul(a, b): # batchmatmul over 4 dim matrix """ Numpy-based batch matrix multiply over 4 dim matrix""" @@ -3235,7 +3282,8 @@ def _batchmatmul(a, b): # batchmatmul over 4 dim matrix def _softmax(x): # softmax over 4 dim matrix """ Numpy-based reference softmax over 4 dim matrix""" - output = np.zeros(x.shape, dtype=np.float32) + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) for i in range(x.shape[0]): for j in range(x.shape[1]): for k in range(x.shape[2]): @@ -3298,7 +3346,7 @@ def _create_src_lengths_mask(batch_size, src_lengths): # returns [batch_size, max_seq_len] return (src_indices < src_lengths).int().detach() - def _multihead_attn_test_helper(use_src_lengths): + def _multihead_attn_test_helper(add_key_padding_mask, add_bias_kv=False, add_zero_attn=False): for _ in range(100): batch_sz, seq_len = [random.randint(2, 10) for r in range(2)] d_head = random.randint(3, 10) @@ -3308,7 +3356,7 @@ def _multihead_attn_test_helper(use_src_lengths): src_lengths = None src_lengths_tensor = None - if use_src_lengths: + if add_key_padding_mask: src_lengths, src_lengths_tensor = _generate_src_lengths( batch_size=batch_sz, seq_len=seq_len ) @@ -3317,28 +3365,44 @@ def 
_multihead_attn_test_helper(use_src_lengths): K = np.random.rand(*dims).astype(np.float64) V = K Q = np.expand_dims(decoder_state, 1) + attn_mask = np.random.randint(0 , 2, size=(1, seq_len)) + attn_mask_tensor = torch.from_numpy(attn_mask).float() + attn_mask_tensor.masked_fill_(attn_mask_tensor == 0, float('-inf')) + attn_mask_tensor.masked_fill_(attn_mask_tensor > 0, float('0.0')) + attn_mask_tensor = attn_mask_tensor.double() decoder_state_tensor = torch.from_numpy(decoder_state).double() source_hid_tensor = torch.from_numpy(K).double().transpose(0, 1) - multihead_attn_module = MultiheadAttention(d_model, nheads) + multihead_attn_module = MultiheadAttention(d_model, nheads, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn) + + if add_bias_kv: + bias_k = multihead_attn_module.bias_k.detach().numpy() + bias_v = multihead_attn_module.bias_v.detach().numpy() + else: + bias_k = None + bias_v = None _batch_size = decoder_state_tensor.shape[0] _Q = decoder_state_tensor.unsqueeze(1).transpose(0, 1) _V = source_hid_tensor _K = source_hid_tensor src_len_mask = None - if src_lengths is not None and use_src_lengths: + if src_lengths is not None and add_key_padding_mask: # [batch_size, 1, seq_len] src_len_mask_int = _create_src_lengths_mask( batch_size=_batch_size, src_lengths=src_lengths_tensor ) src_len_mask = src_len_mask_int != 1 - - result = multihead_attn_module( + result, result_weight = multihead_attn_module( _Q, _K, _V, key_padding_mask=src_len_mask, - need_weights=True)[0].squeeze(0).detach().numpy() + need_weights=True, + attn_mask=attn_mask_tensor) + + result = result.squeeze(0).detach().numpy() Q_fc = _fc(Q, "in_proj_", multihead_attn_module, end=d_model) K_fc = _fc( @@ -3346,20 +3410,31 @@ def _multihead_attn_test_helper(use_src_lengths): ) V_fc = _fc(V, "in_proj_", multihead_attn_module, start=2 * d_model) + if add_bias_kv: + K_fc = np.concatenate((K_fc, np.repeat(bias_k, K_fc.shape[0], axis=0)), axis=1) + V_fc = np.concatenate((V_fc, np.repeat(bias_v, V_fc.shape[0], axis=0)), axis=1) + attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1) + dims[1] += 1 Q_split = _split_heads_ref( Q_fc, [batch_sz, 1, d_model], nheads, d_head ) K_split = _split_heads_ref(K_fc, dims, nheads, d_head) V_split = _split_heads_ref(V_fc, dims, nheads, d_head) - attn_heads = _scaled_dot_attn_ref( + if add_zero_attn: + dims[1] += 1 + K_split = np.concatenate((K_split, np.zeros([K_split.shape[0], K_split.shape[1], 1, K_split.shape[3]])), axis=2) + V_split = np.concatenate((V_split, np.zeros([V_split.shape[0], V_split.shape[1], 1, V_split.shape[3]])), axis=2) + attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1) + + attn_heads, ref_attn_weight = _scaled_dot_attn_ref( Q=Q_split, K=K_split, V=V_split, dims=Q_split.shape, - src_lengths=src_lengths, + unseen_mask=attn_mask, + src_lengths=src_lengths ) - combined_attn_heads = _combine_heads_ref( X=attn_heads, dims=[batch_sz, 1], nheads=nheads, d_head=d_head ) @@ -3373,14 +3448,27 @@ def _multihead_attn_test_helper(use_src_lengths): self.assertEqual(tuple(result.shape), (batch_sz, d_model)) np.testing.assert_allclose(result, reference, atol=1e-5) + # result_weight = ref_attn_weight + result_weight = result_weight.detach().numpy() + self.assertEqual(tuple(result_weight.shape), tuple(ref_attn_weight.shape)) + np.testing.assert_allclose(result_weight, ref_attn_weight, atol=1e-5) + + def test_multihead_attn_add_bias_kv(): + _multihead_attn_test_helper(add_key_padding_mask=None, add_bias_kv=True) + + def test_multihead_attn_add_zero_attn(): 
+ _multihead_attn_test_helper(add_key_padding_mask=None, add_zero_attn=True) + def test_multihead_attn_no_masking(): - _multihead_attn_test_helper(use_src_lengths=None) + _multihead_attn_test_helper(add_key_padding_mask=None) - def test_multihead_attn_with_src_lengths(): - _multihead_attn_test_helper(use_src_lengths=True) + def test_multihead_attn_key_padding_mask(): + _multihead_attn_test_helper(add_key_padding_mask=True) + test_multihead_attn_add_zero_attn() # Test MultiheadAttention with add_zero_attn + test_multihead_attn_add_bias_kv() # Test MultiheadAttention with add_bias_kv test_multihead_attn_no_masking() # Test MultiheadAttention without masking - test_multihead_attn_with_src_lengths() # Test MultiheadAttention with src lengths + test_multihead_attn_key_padding_mask() # Test MultiheadAttention with src lengths def test_normalize(self): inputs = torch.randn(1, 3, 4, 4, requires_grad=True) @@ -4308,6 +4396,19 @@ def test_load_state_dict_BC(self): self.assertEqual(bn.num_batches_tracked.dtype, torch.long) self.assertEqual(bn.num_batches_tracked.item(), 0) + @unittest.skipIf(not PY3, 'Python 2.7 generates cyclic trash') + def test_load_state_dict_ref_cycle(self): + # load_state_dict shouldn't cause a reference cycle involving Tensors + import gc + + m = torch.nn.LSTM(16, 16, bidirectional=True) + + gc.collect() + m.load_state_dict(deepcopy(m).state_dict()) + refcycles = gc.collect() + + self.assertEqual(refcycles, 0) + def test_parameter_assignment(self): l = nn.Linear(5, 5) @@ -4939,6 +5040,17 @@ def test_invalid_dropout_p(self): self.assertRaises(ValueError, lambda: F.dropout(v, -0.1)) self.assertRaises(ValueError, lambda: F.dropout(v, 1.1)) + def test_empty_dropout(self): + x = torch.Tensor([]) + out = torch.nn.functional.dropout(x) + self.assertEqual(out.size(), x.size()) + + @unittest.skipIf(not TEST_CUDA, 'CUDA not available') + def test_empty_dropout_cuda(self): + x = torch.Tensor([]).to('cuda') + out = torch.nn.functional.dropout(x) + self.assertEqual(out.size(), x.size()) + def test_pad_sequence(self): def pad(tensor, length): return torch.cat( diff --git a/test/test_nn_quantized.py b/test/test_nn_quantized.py index 845fc0f7b00b..fa7fafaa8831 100644 --- a/test/test_nn_quantized.py +++ b/test/test_nn_quantized.py @@ -18,7 +18,7 @@ def test_functional_api(self): Y = X.numpy().copy() Y[Y < 0] = 0 qY = _quantize(Y, scale, zero_point) - qX = X.quantize_linear(scale=scale, zero_point=zero_point) + qX = X.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) qY_hat = F.relu(qX) np.testing.assert_equal(qY, qY_hat.int_repr()) diff --git a/test/test_numba_integration.py b/test/test_numba_integration.py index 105e181a128f..deacc58902e9 100644 --- a/test/test_numba_integration.py +++ b/test/test_numba_integration.py @@ -256,6 +256,94 @@ def test_active_device(self): numba.cuda.as_cuda_array(cudat), numba.cuda.devicearray.DeviceNDArray ) + @unittest.skipIf(not TEST_NUMPY, "No numpy") + @unittest.skipIf(not TEST_CUDA, "No cuda") + @unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda") + def test_from_cuda_array_interface(self): + """torch.as_tensor() and torch.tensor() supports the __cuda_array_interface__ protocol. + + If an object exposes the __cuda_array_interface__, .as_tensor() and .tensor() + will use the exposed device memory. 
+ + See: + https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + """ + + dtypes = [ + numpy.float64, + numpy.float32, + numpy.int64, + numpy.int32, + numpy.int16, + numpy.int8, + numpy.uint8, + ] + for dtype in dtypes: + numpy_arys = [ + numpy.arange(6).reshape(2, 3).astype(dtype), + numpy.arange(6).reshape(2, 3).astype(dtype)[1:], # View offset should be ignored + numpy.arange(6).reshape(2, 3).astype(dtype)[:, None], # change the strides but still contiguous + ] + # Zero-copy when using `torch.as_tensor()` + for numpy_ary in numpy_arys: + numba_ary = numba.cuda.to_device(numpy_ary) + torch_ary = torch.as_tensor(numba_ary, device="cuda") + self.assertEqual(numba_ary.__cuda_array_interface__, torch_ary.__cuda_array_interface__) + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + + # Check that `torch_ary` and `numba_ary` points to the same device memory + torch_ary += 42 + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + + # Implicit-copy because `torch_ary` is a CPU array + for numpy_ary in numpy_arys: + numba_ary = numba.cuda.to_device(numpy_ary) + torch_ary = torch.as_tensor(numba_ary, device="cpu") + self.assertEqual(torch_ary.data.numpy(), numpy.asarray(numba_ary)) + + # Check that `torch_ary` and `numba_ary` points to different memory + torch_ary += 42 + self.assertEqual(torch_ary.data.numpy(), numpy.asarray(numba_ary) + 42) + + # Explict-copy when using `torch.tensor()` + for numpy_ary in numpy_arys: + numba_ary = numba.cuda.to_device(numpy_ary) + torch_ary = torch.tensor(numba_ary, device="cuda") + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + + # Check that `torch_ary` and `numba_ary` points to different memory + torch_ary += 42 + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary) + 42) + + @unittest.skipIf(not TEST_NUMPY, "No numpy") + @unittest.skipIf(not TEST_CUDA, "No cuda") + @unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda") + def test_from_cuda_array_interface_lifetime(self): + """torch.as_tensor(obj) tensor grabs a reference to obj so that the lifetime of obj exceeds the tensor""" + numba_ary = numba.cuda.to_device(numpy.arange(6)) + torch_ary = torch.as_tensor(numba_ary, device="cuda") + self.assertEqual(torch_ary.__cuda_array_interface__, numba_ary.__cuda_array_interface__) # No copy + del numba_ary + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.arange(6)) # `torch_ary` is still alive + + @unittest.skipIf(not TEST_NUMPY, "No numpy") + @unittest.skipIf(not TEST_CUDA, "No cuda") + @unittest.skipIf(not TEST_NUMBA_CUDA, "No numba.cuda") + @unittest.skipIf(not TEST_MULTIGPU, "No multigpu") + def test_from_cuda_array_interface_active_device(self): + """torch.as_tensor() tensor device must match active numba context.""" + + # Both torch/numba default to device 0 and can interop freely + numba_ary = numba.cuda.to_device(numpy.arange(6)) + torch_ary = torch.as_tensor(numba_ary, device="cuda") + self.assertEqual(torch_ary.cpu().data.numpy(), numpy.asarray(numba_ary)) + self.assertEqual(torch_ary.__cuda_array_interface__, numba_ary.__cuda_array_interface__) + + # Torch should raise `RuntimeError` when the Numba and Torch device differ + numba_ary = numba.cuda.to_device(numpy.arange(6)) + with self.assertRaises(RuntimeError): + torch.as_tensor(numba_ary, device=torch.device("cuda", 1)) + if __name__ == "__main__": common.run_tests() diff --git a/test/test_quantized.py b/test/test_quantized.py index e1c6b0ec2f74..8553265d372e 100644 --- 
a/test/test_quantized.py +++ b/test/test_quantized.py @@ -25,13 +25,21 @@ def _dequantize(qx, scale, zero_point): return x +def _requantize(x, multiplier, zero_point, qmin=0, qmax=255, qtype=np.uint8): + """Requantizes a numpy array, i.e., intermediate int32 or int16 values are + converted back to given type""" + qx = (x * multiplier).round() + zero_point + qx = np.clip(qx, qmin, qmax).astype(qtype) + return qx + + # Make sure we won't have overflows from vpmaddubsw instruction used in FBGEMM. # On the current Intel x86 architecture, we need to utilize vpmaddubsw instruction # for the 8-bit int multiplication. This instruction vertically multiplies each # unsigned 8-bit integer from a with the corresponding signed 8-bit integer from # b, producing intermediate signed 16-bit integers. This function modifies the # weights to eliminate the overflow on the signed 16-bit integers. -def avoid_vpmaddubsw_overflow_fc( +def avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max ): for i, j in np.ndindex((batch_size, output_channels)): @@ -57,8 +65,8 @@ def avoid_vpmaddubsw_overflow_fc( assert -(1 << 15) <= x0 * w0 + x1 * w1 < (1 << 15) -# Reference quantized FC operator -def qfc_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): +# Reference quantized Linear operator +def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): row_offsets_ref = X_q.sum(axis=1).astype(np.int32).reshape((-1, 1)) col_offsets_ref = W_q.sum(axis=1).astype(np.int32).reshape((1, -1)) assert X_q.ndim == 2 @@ -122,9 +130,7 @@ def test_qrelu(self): X = torch.arange(-5, 5, dtype=torch.float) scale = 2.0 zero_point = 1 - qX = X.quantize_linear(scale=scale, zero_point=zero_point) - # print("X:\n{}".format(X)) - # print("\nQuantized:\n{}\nFake:\n{}".format(qX.int_repr(), _quantize(X.numpy(), scale, zero_point))) + qX = X.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) Y = X.numpy().copy() Y[Y < 0] = 0 @@ -132,28 +138,37 @@ def test_qrelu(self): qY_hat = relu(qX) np.testing.assert_equal(qY, qY_hat.int_repr()) - """Tests the correctness of the quantized::sum_relu op.""" - def test_qsumrelu_same_qparams(self): - sum_relu = torch.ops.quantized.sum_relu + """Tests the correctness of the add and add_relu op.""" + def test_qadd_relu_same_qparams(self): + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add A = torch.arange(-25, 25, dtype=torch.float) B = torch.arange(-25, 25, dtype=torch.float) scale = 2.0 zero_point = 127 - qA = A.quantize_linear(scale=scale, zero_point=zero_point) - qB = A.quantize_linear(scale=scale, zero_point=zero_point) + qA = A.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) + qB = A.quantize_linear(scale=scale, zero_point=zero_point, dtype=torch.quint8) - # Sum + ReLU ground truth + # Add ReLU ground truth C = (qA.dequantize() + qB.dequantize()).numpy() - C[C < 0] = 0 qC = _quantize(C, scale, zero_point) - - qC_hat = sum_relu(qA, qB, scale=scale, zero_point=zero_point) - np.testing.assert_equal(qC, qC_hat.int_repr()) - - """Tests the correctness of the quantized::sum_relu op.""" - def test_qsumrelu_different_qparams(self): - sum_relu = torch.ops.quantized.sum_relu + qC_hat = add(qA, qB, scale=scale, zero_point=zero_point) + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") + + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale, zero_point) + qCrelu_hat = add_relu(qA, qB, scale=scale, 
zero_point=zero_point) + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") + + """Tests the correctness of the add and add_relu op.""" + def test_qadd_relu_different_qparams(self): + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add A = torch.arange(-25, 25, dtype=torch.float) B = torch.arange(-25, 25, dtype=torch.float) @@ -165,30 +180,37 @@ def test_qsumrelu_different_qparams(self): scale_C = 0.5 zero_point_C = 5 - qA = A.quantize_linear(scale=scale_A, zero_point=zero_point_A) - qB = A.quantize_linear(scale=scale_B, zero_point=zero_point_B) + qA = A.quantize_linear(scale=scale_A, zero_point=zero_point_A, dtype=torch.quint8) + qB = A.quantize_linear(scale=scale_B, zero_point=zero_point_B, dtype=torch.quint8) - # Sum + ReLU ground truth + # Add ground truth C = (qA.dequantize() + qB.dequantize()).numpy() - C[C < 0] = 0 qC = _quantize(C, scale_C, zero_point_C) + qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point_C) + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") - qC_hat = sum_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) - np.testing.assert_equal(qC, qC_hat.int_repr()) + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale_C, zero_point_C) + qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") @unittest.skipIf( TEST_WITH_UBSAN or not torch.fbgemm_is_cpu_supported(), - " Quantized FC requires FBGEMM. FBGEMM does not play" + " Quantized Linear requires FBGEMM. FBGEMM does not play" " well with UBSAN at the moment, so we skip the test if" " we are in a UBSAN environment.", ) -class TestQuantizedFC(unittest.TestCase): - """Tests the correctness of the quantized::fc op.""" +class TestQuantizedLinear(unittest.TestCase): + """Tests the correctness of the quantized::fbgemm_linear op.""" - def test_qfc(self): - qfc_prepack = torch.ops.quantized.fbgemm_linear_prepack - qfc = torch.ops.quantized.fbgemm_linear + def test_qlinear(self): + qlinear_prepack = torch.ops.quantized.fbgemm_linear_prepack + qlinear = torch.ops.quantized.fbgemm_linear batch_size = 4 input_channels = 16 @@ -204,7 +226,6 @@ def test_qfc(self): ).astype(np.uint8) W_scale = 0.4 - # W_zp is the zero point for int8 quantization. W_zp = 2 W_value_min = -128 W_value_max = 127 @@ -214,7 +235,13 @@ def test_qfc(self): + W_value_min ).astype(np.int8) - avoid_vpmaddubsw_overflow_fc( + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) + + avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, output_channels, @@ -228,24 +255,24 @@ def test_qfc(self): X = torch.from_numpy(_dequantize(X_q0, X_scale, X_zp)).to(dtype=torch.float) W = torch.from_numpy(_dequantize(W_q0, W_scale, W_zp)).to(dtype=torch.float) + b = torch.from_numpy(_dequantize(b_q0, X_scale * W_scale, 0)).to(dtype=torch.float) - X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp) - # W_zp + 128 is the zero point for uint8 quantization. 
- W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp + 128) - b_q = torch.round(torch.rand(output_channels) * 10 - 10).to(dtype=torch.int32) + X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp, dtype=torch.quint8) + W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp, dtype=torch.qint8) + b_q = b.quantize_linear(scale=X_scale * W_scale, zero_point=0, dtype=torch.qint32) # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with # Y_scale * 255 (max for uint8). Y_scale = 125.1234 Y_zp = 5 - # Reference quantized FC operator - Y_q_ref = qfc_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q.numpy(), Y_scale, Y_zp) + # Reference quantized Linear operator + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q0, Y_scale, Y_zp) - # Weight prepacking operator for quantized FC - W_prepack = qfc_prepack(W_q) - # Quantized FC operator with prepacked weight - Y_q = qfc(X_q, W_prepack, b_q, Y_scale, Y_zp) + # Weight prepacking operator for quantized Linear + W_prepack = qlinear_prepack(W_q) + # Quantized Linear operator with prepacked weight + Y_q = qlinear(X_q, W_prepack, b_q, Y_scale, Y_zp) # Y_q_ref_real = _dequantize(Y_q_ref, Y_scale, Y_zp) # Y_q_real = Y_q.dequantize() @@ -256,18 +283,18 @@ def test_qfc(self): # Reference quantized result from PyTorch Linear operator W_fp32 = W_q.dequantize().to(dtype=torch.float) X_fp32 = X_q.dequantize().to(dtype=torch.float) - b_fp32 = torch.from_numpy(_dequantize(b_q.numpy(), W_scale * X_scale, 0).astype(np.float)).to(dtype=torch.float) + b_fp32 = b_q.dequantize().to(dtype=torch.float) Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) - Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp) + Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp, torch.quint8) # Assert equal np.testing.assert_equal(Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy()) - """Tests the correctness of the quantized::fc op.""" - def test_qfcrelu(self): - qfc_prepack = torch.ops.quantized.fbgemm_linear_prepack - qfcrelu = torch.ops.quantized.fbgemm_linear_relu + """Tests the correctness of the quantized::fbgemm_linear_relu op.""" + def test_qlinear_relu(self): + qlinear_prepack = torch.ops.quantized.fbgemm_linear_prepack + qlinear_relu = torch.ops.quantized.fbgemm_linear_relu batch_size = 4 input_channels = 16 @@ -292,7 +319,13 @@ def test_qfcrelu(self): + W_value_min ).astype(np.int8) - avoid_vpmaddubsw_overflow_fc( + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) + + avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, output_channels, @@ -306,24 +339,25 @@ def test_qfcrelu(self): X = torch.from_numpy(_dequantize(X_q0, X_scale, X_zp)).to(dtype=torch.float) W = torch.from_numpy(_dequantize(W_q0, W_scale, W_zp)).to(dtype=torch.float) + b = torch.from_numpy(_dequantize(b_q0, X_scale * W_scale, 0)).to(dtype=torch.float) - X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp) - W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp + 128) - b_q = torch.round(torch.rand(output_channels) * 10 - 10).to(dtype=torch.int32) + X_q = X.quantize_linear(scale=X_scale, zero_point=X_zp, dtype=torch.quint8) + W_q = W.quantize_linear(scale=W_scale, zero_point=W_zp, dtype=torch.qint8) + b_q = b.quantize_linear(scale=X_scale * W_scale, zero_point=0, dtype=torch.qint32) # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with # Y_scale * 255 (max for uint8). 
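# (Roughly: ignoring zero points and bias, the largest real output of the matmul is about
#  input_channels * (X_value_max * X_scale) * (W_value_max * W_scale), while a uint8 output
#  with scale Y_scale can represent real values only up to about Y_scale * 255, so this
#  comparison is a loose sanity check that the chosen Y_scale will not saturate the output.)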
Y_scale = 125.1234 Y_zp = 5 - # Reference quantized FC operator - Y_q_ref = qfc_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q.numpy(), Y_scale, Y_zp) + # Reference quantized Linear operator + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, W_scale, W_zp, b_q0, Y_scale, Y_zp) Y_q_ref[Y_q_ref < Y_zp] = Y_zp - # Weight prepacking operator for quantized FC - W_prepack = qfc_prepack(W_q) - # Quantized FC operator with prepacked weight - Y_q = qfcrelu(X_q, W_prepack, b_q, Y_scale, Y_zp) + # Weight prepacking operator for quantized Linear + W_prepack = qlinear_prepack(W_q) + # Quantized Linear operator with prepacked weight + Y_q = qlinear_relu(X_q, W_prepack, b_q, Y_scale, Y_zp) # Y_q_ref_real = _dequantize(Y_q_ref, Y_scale, Y_zp) # Y_q_real = Y_q.dequantize() @@ -334,14 +368,128 @@ def test_qfcrelu(self): # Reference quantized result from PyTorch Linear operator W_fp32 = W_q.dequantize().to(dtype=torch.float) X_fp32 = X_q.dequantize().to(dtype=torch.float) - b_fp32 = torch.from_numpy(_dequantize(b_q.numpy(), W_scale * X_scale, 0).astype(np.float)).to(dtype=torch.float) + b_fp32 = b_q.dequantize().to(dtype=torch.float) Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0 - Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp) + Y_q_ref2 = Y_fp32_ref.quantize_linear(Y_scale, Y_zp, torch.quint8) # Assert equal np.testing.assert_equal(Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy()) +@unittest.skipIf( + TEST_WITH_UBSAN or not torch.fbgemm_is_cpu_supported(), + " Quantized convolution requires FBGEMM. FBGEMM does not play" + " well with UBSAN at the moment, so we skip the test if" + " we are in a UBSAN environment.", +) +class TestQuantizedConv(unittest.TestCase): + """Tests the correctness of quantized convolution op.""" + def test_qconv(self): + + qconv = torch.ops.quantized.fbgemm_conv2d + qconv_prepack = torch.ops.quantized.fbgemm_conv_prepack + + # N + batch_size = 1 + # C + input_channels = 16 + # H, W + height = width = 24 + # K + output_channels = 8 + + kernel_h = kernel_w = 3 + stride_h = stride_w = 1 + padding_h = padding_w = 1 + dilation_h = dilation_w = 1 + groups = 1 + + W_value_min = 0 + W_value_max = 5 + # We use small values to avoid overflow. + # (the operator expects them in the format (output_channels, input_channels/groups, kernel_h, kernel_w)) + + W_init = torch.randint( + W_value_min, + W_value_max, + (output_channels, int(input_channels / groups), kernel_h, kernel_w), + ) + + b_init = torch.randint(0, 10, (output_channels,)) + + # Existing floating point conv operator + conv_op = torch.nn.Conv2d( + input_channels, + output_channels, + (kernel_h, kernel_w), + (stride_h, stride_w), + (padding_h, padding_w), + (dilation_h, dilation_w), + groups, + ) + + # assign the weights + conv_op.weight = torch.nn.Parameter( + W_init.to(dtype=torch.float), requires_grad=False + ) + conv_op.bias = torch.nn.Parameter( + b_init.to(dtype=torch.float), requires_grad=False + ) + + X_value_min = 0 + X_value_max = 4 + X_init = torch.randint( + X_value_min, X_value_max, (batch_size, input_channels, height, width) + ) + + # run on an input tensor + result_ref = conv_op(X_init.to(dtype=torch.float)) + + # reformat X_init and W_init in the required format by conv operator + # NCHW -> NHWC + X_NHWC = X_init.permute([0, 2, 3, 1]).contiguous() + # KCRS -> RSCK + W_RSCK = W_init.permute([2, 3, 1, 0]).contiguous() + + X_scale = 1.5 + # Currently only 0 as zero point is supported. 
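# Aside: a quick shape check of the layout permutations used by this conv test
# (purely illustrative; the sizes mirror the batch/channel/kernel settings above):
import torch

x_nchw = torch.zeros(1, 16, 24, 24)        # N, C, H, W
print(x_nchw.permute(0, 2, 3, 1).shape)    # torch.Size([1, 24, 24, 16])  NCHW -> NHWC

w_kcrs = torch.zeros(8, 16, 3, 3)          # K, C, R, S
print(w_kcrs.permute(2, 3, 1, 0).shape)    # torch.Size([3, 3, 16, 8])    KCRS -> RSCK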
+ X_zero_point = 0 + X = X_scale * (X_NHWC - X_zero_point).to(dtype=torch.float) + + W_scale = 2.5 + W_zero_point = 0 + W = W_scale * (W_RSCK - W_zero_point).to(dtype=torch.float) + + X_q = X.quantize_linear(scale=X_scale, zero_point=X_zero_point, dtype=torch.quint8) + W_q = W.quantize_linear(scale=W_scale, zero_point=W_zero_point, dtype=torch.quint8) + b_q = b_init.to(dtype=torch.int32) + + W_prepack = qconv_prepack(W_q, groups) + Y_scale = 7.3 + Y_zero_point = 5 + + Y_q = qconv( + X_q, + W_prepack, + b_q, + [1, 1], # stride + [1, 1], # padding + [1, 1], # dilation + [0, 0], # output_padding + 1, # groups + Y_scale, + Y_zero_point, + ) + + result_NHWK = result_ref.permute([0, 2, 3, 1]) + result_q = _requantize( + result_NHWK.numpy(), X_scale * W_scale / Y_scale, Y_zero_point + ) + + # Make sure the results match + np.testing.assert_equal(result_q, Y_q.int_repr().numpy()) + + if __name__ == "__main__": run_tests() diff --git a/test/test_sparse.py b/test/test_sparse.py index 764f0a38c552..e105f91139d9 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -234,33 +234,44 @@ def fn(x): [0, 0, 0, 3], [0, 0, 1, 4], ]) - v = self.value_tensor([2, 1, 3, 4]) - x = self.sparse_tensor(i, v, torch.Size([3, 4, 5])) - res = self.value_tensor([ - [[2, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]], - [[1, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]], - [[0, 3, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 4]], - ]) - test_tensor(x, res) - - i = self.index_tensor([ - [0, 1, 2, 2], - [0, 0, 0, 3], - [0, 0, 1, 4], - ]) - v = self.value_empty(4, 0) - x = self.sparse_tensor(i, v, torch.Size([3, 4, 5, 0])) - res = self.value_empty(3, 4, 5, 0) - test_tensor(x, res) + # we don't have to_dense for half types on CPU because it is implemented + # with a slower add_ operation + for dtype in [torch.float16, torch.float64] if self.device != 'cpu' else [torch.float64]: + v = self.value_tensor([2, 1, 3, 4]).to(dtype=dtype) + x = self.sparse_tensor(i, v, torch.Size([3, 4, 5])) + res = self.value_tensor([ + [[2, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[1, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[0, 3, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 4]], + ]).to(dtype=dtype) + + test_tensor(x, res) + + i = self.index_tensor([ + [0, 1, 2, 2], + [0, 0, 0, 3], + [0, 0, 1, 4], + ]) + v = self.value_empty(4, 0).to(dtype=dtype) + x = self.sparse_tensor(i, v, torch.Size([3, 4, 5, 0])) + res = self.value_empty(3, 4, 5, 0).to(dtype=dtype) + test_tensor(x, res) + + # half tensors on cpu don't implement to_dense, so need to convert to float + def _to_dense_half_safe(self, tensor): + if(tensor.dtype == torch.half and tensor.device.type == 'cpu'): + return tensor.to(torch.float).to_dense().to(torch.half) + else: + return tensor.to_dense() def test_to_sparse(self): shape = [10, 5, 19, 8] @@ -269,12 +280,15 @@ def test_to_sparse(self): max_nnz *= dim_sz rnnz = torch.randint(2, max_nnz, (1,)).item() for nnz in [0, 1, rnnz]: - expected, _, _ = self._gen_sparse(dim, nnz, shape) - d = expected.to_dense() - result = d.to_sparse(dim) - self.assertEqual(d, result.to_dense()) # == not implemented for sparse tensors yet - self.assertEqual(expected.size(), result.size()) - self.assertEqual(dim, result.sparse_dim()) + for dtype in [torch.float16, torch.float64, torch.int]: + expected, _, _ = self._gen_sparse(dim, nnz, shape) + expected = expected.to(dtype) + + d = 
self._to_dense_half_safe(expected) + result = d.to_sparse(dim) + self.assertEqual(d, self._to_dense_half_safe(result)) # == not implemented for sparse tensors yet + self.assertEqual(expected.size(), result.size()) + self.assertEqual(dim, result.sparse_dim()) sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3]) self.assertRaises(RuntimeError, lambda: sp.to_sparse()) @@ -563,6 +577,12 @@ def test_Sparse_to_Sparse_copy_(self): # test type conversion (when x1.copy_(x2), x1.dtype should stay the same) x1 = x1.to(torch.float32) + + x2 = x2.to(torch.float16) + x1_dtype = x1.dtype + x1.copy_(x2) + self.assertEqual(x1_dtype, x1.dtype) + x2 = x2.to(torch.float64) x1_dtype = x1.dtype x1.copy_(x2) @@ -630,6 +650,12 @@ def test_tensor(x): x = torch.sparse.FloatTensor(2, 3, 4) test_tensor(x) + x = torch.sparse.HalfTensor(2, 3, 4) + test_tensor(x) + + x = torch.cuda.sparse.HalfTensor(2, 3, 4) + test_tensor(x) + x = torch.sparse.FloatTensor(2, 3, 4, 0) test_tensor(x) @@ -1512,33 +1538,33 @@ def test_factory(self): for use_tensor_idx in [True, False]: for use_tensor_val in [True, False]: for use_cuda in ([False] if not torch.cuda.is_available() else [True, False]): - # have to include size with cuda sparse tensors - include_size = include_size or use_cuda - dtype = torch.float64 - long_dtype = torch.int64 - device = torch.device('cpu') if not use_cuda else \ - torch.device(torch.cuda.device_count() - 1) - indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) - if test_empty_tensor: - values = self.value_empty(1, 0) - else: - if use_tensor_val: - values = torch.tensor([1.], dtype=dtype) + for dtype in [torch.float64, torch.float16]: + # have to include size with cuda sparse tensors + include_size = include_size or use_cuda + long_dtype = torch.int64 + device = torch.device('cpu') if not use_cuda else \ + torch.device(torch.cuda.device_count() - 1) + indices = torch.tensor(([0], [2]), dtype=long_dtype) if use_tensor_idx else ([0], [2]) + if test_empty_tensor: + values = self.value_empty(1, 0).to(dtype) else: - values = 1. - if include_size: - sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, - device=device, requires_grad=True) - else: - sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, - device=device, requires_grad=True) - self.assertEqual(indices, sparse_tensor._indices()) - self.assertEqual(values, sparse_tensor._values()) - self.assertEqual(size if include_size else default_size, sparse_tensor.size()) - self.assertEqual(dtype, sparse_tensor.dtype) - if use_cuda: - self.assertEqual(device, sparse_tensor._values().device) - self.assertEqual(True, sparse_tensor.requires_grad) + if use_tensor_val: + values = torch.tensor([1.], dtype=dtype) + else: + values = 1. 
+ if include_size: + sparse_tensor = torch.sparse_coo_tensor(indices, values, size, dtype=dtype, + device=device, requires_grad=True) + else: + sparse_tensor = torch.sparse_coo_tensor(indices, values, dtype=dtype, + device=device, requires_grad=True) + self.assertEqual(indices, sparse_tensor._indices()) + self.assertEqual(values, sparse_tensor._values()) + self.assertEqual(size if include_size else default_size, sparse_tensor.size()) + self.assertEqual(dtype, sparse_tensor.dtype) + if use_cuda: + self.assertEqual(device, sparse_tensor._values().device) + self.assertEqual(True, sparse_tensor.requires_grad) def test_factory_size_check(self): indices = self.index_tensor([[1, 2], @@ -1653,6 +1679,8 @@ def test_factory_dense_dim(self): @cpu_only def test_factory_type_inference(self): + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float16)) + self.assertEqual(torch.float16, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) self.assertEqual(torch.float32, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float64)) @@ -1660,6 +1688,8 @@ def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1])) self.assertEqual(torch.int64, t.dtype) + t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.HalfTensor(1, 0)) + self.assertEqual(torch.float16, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.FloatTensor(1, 0)) self.assertEqual(torch.float32, t.dtype) t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.DoubleTensor(1, 0)) @@ -1713,6 +1743,10 @@ def test_tensor(indices, values, indices_equal, values_equal): values = torch.tensor([1.], dtype=torch.float32) test_tensor(indices, values, True, False) + indices = torch.tensor(([0], [2]), dtype=torch.int64) + values = torch.tensor([1.], dtype=torch.float16) + test_tensor(indices, values, True, False) + indices = torch.tensor(([0], [2]), dtype=torch.int64) values = torch.FloatTensor(1, 0) test_tensor(indices, values, True, True) # An empty tensor's data_ptr is always equal to 0 @@ -1766,14 +1800,14 @@ def test_constructor_device_legacy(self): @cpu_only # not really, but we only really want to run this once def test_dtypes(self): - all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.float16] + all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes()] do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.is_available(): do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0')) @cpu_only # not really, but we only really want to run this once def test_empty_full(self): - all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes() if dtype != torch.float16] + all_sparse_dtypes = [dtype for dtype in torch.testing.get_all_dtypes()] do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.device_count() > 0: do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None) @@ -1923,6 +1957,47 @@ def do_test(t): do_test(self.sparse_empty(3, 0).data) do_test(self.sparse_empty(3, 0).detach()) + def test_change_tensor_metadata(self): + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.resize_(2, 3) + v.resize_(4, 5) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + 
self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.resize_as_(self.index_tensor([0, 1])) + v.resize_as_(self.value_tensor([3, 4, 5])) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.as_strided_((2, 1), (1, 1)) + v.as_strided_((1, 3), (1, 1)) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.set_(self.index_tensor([0, 1])) + v.set_(self.value_tensor([3, 4, 5])) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + + i = self.index_tensor([[0], [1]]) + v = self.value_tensor([[3, 4, 5]]) + t = torch.sparse_coo_tensor(i, v, torch.Size([1, 2, 3])) + i.transpose_(0, 1) + v.transpose_(0, 1) + self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) + self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + class TestUncoalescedSparse(TestSparse): def setUp(self): diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 078c2822cf6d..6990e5787b74 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -22,6 +22,13 @@ HAS_TORCHVISION = False skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision") +TEST_CAFFE2 = True +try: + from caffe2.python import workspace +except ImportError: + TEST_CAFFE2 = False +skipIfNoCaffe2 = unittest.skipIf(not TEST_CAFFE2, "no caffe2") + TEST_MATPLOTLIB = True try: import matplotlib @@ -72,6 +79,10 @@ def test_pytorch_np(self): self.assertIsInstance(make_np(0), np.ndarray) self.assertIsInstance(make_np(0.1), np.ndarray) + def test_pytorch_autograd_np(self): + x = torch.autograd.Variable(torch.Tensor(1)) + self.assertIsInstance(make_np(x), np.ndarray) + def test_pytorch_write(self): with SummaryWriter() as w: w.add_scalar('scalar', torch.autograd.Variable(torch.rand(1)), 0) @@ -94,7 +105,7 @@ def test_pytorch_histogram_raw(self): num=num, sum=floats.sum().item(), sum_squares=sum_sq, - bucket_limits=limits.tolist(), + bucket_limits=limits[1:].tolist(), bucket_counts=counts.tolist()) ints = make_np(torch.randint(0, 100, (num,))) @@ -107,7 +118,7 @@ def test_pytorch_histogram_raw(self): num=num, sum=ints.sum().item(), sum_squares=sum_sq, - bucket_limits=limits.tolist(), + bucket_limits=limits[1:].tolist(), bucket_counts=counts.tolist()) ints = torch.tensor(range(0, 100)).float() @@ -137,13 +148,31 @@ def test_to_HWC(self): self.assertEqual(converted.shape, (32, 32, 3)) def test_prepare_video(self): - # at each timestep the sum over all other dimensions of the video should stay the same - V_before = np.random.random((4, 10, 3, 20, 20)) - V_after = _prepare_video(np.copy(V_before)) - V_before = np.swapaxes(V_before, 0, 1) - V_before = np.reshape(V_before, newshape=(10, -1)) - V_after = np.reshape(V_after, newshape=(10, -1)) - np.testing.assert_array_almost_equal(np.sum(V_before, axis=1), np.sum(V_after, axis=1)) + # At each timeframe, the sum over all other + # dimensions of the video should be the same. 
+ shapes = [(16, 30, 3, 28, 28), + (36, 30, 3, 28, 28), + (19, 29, 3, 23, 19), + (3, 3, 3, 3, 3)] + for s in shapes: + V_input = np.random.random(s) + V_after = _prepare_video(np.copy(V_input)) + total_frame = s[1] + V_input = np.swapaxes(V_input, 0, 1) + for f in range(total_frame): + x = np.reshape(V_input[f], newshape=(-1)) + y = np.reshape(V_after[f], newshape=(-1)) + np.testing.assert_array_almost_equal(np.sum(x), np.sum(y)) + + def test_numpy_vid_uint8(self): + V_input = np.random.randint(0, 256, (16, 30, 3, 28, 28)).astype(np.uint8) + V_after = _prepare_video(np.copy(V_input)) * 255 + total_frame = V_input.shape[1] + V_input = np.swapaxes(V_input, 0, 1) + for f in range(total_frame): + x = np.reshape(V_input[f], newshape=(-1)) + y = np.reshape(V_after[f], newshape=(-1)) + np.testing.assert_array_almost_equal(np.sum(x), np.sum(y)) freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440] @@ -500,15 +529,19 @@ def test_scalar(self): res = make_np(np.int64(100000000000)) self.assertIsInstance(res, np.ndarray) and self.assertEqual(res.shape, (1,)) - def test_numpy_vid(self): - shapes = [(16, 3, 30, 28, 28), (19, 3, 30, 28, 28), (19, 3, 29, 23, 19)] - for s in shapes: - x = np.random.random_sample(s) - # assert make_np(x, 'VID').shape[3] == 3 + @skipIfNoCaffe2 + def test_caffe2_np(self): + workspace.FeedBlob("testBlob", np.random.randn(1, 3, 64, 64).astype(np.float32)) + self.assertIsInstance(make_np('testBlob'), np.ndarray) - def test_numpy_vid_uint8(self): - x = np.random.randint(0, 256, (16, 3, 30, 28, 28)).astype(np.uint8) - # make_np(x, 'VID').shape[3] == 3 + @skipIfNoCaffe2 + def test_caffe2_np_expect_fail(self): + with self.assertRaises(RuntimeError): + res = make_np('This_blob_does_not_exist') + + def test_pytorch_np_expect_fail(self): + with self.assertRaises(NotImplementedError): + res = make_np({'pytorch': 1.0}) if __name__ == '__main__': run_tests() diff --git a/test/test_torch.py b/test/test_torch.py index 6eb773c54d13..bf814b561b04 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1447,6 +1447,18 @@ def test_mv(self): self.assertEqual(res1, res2) + def test_numpy_args(self): + x1 = torch.randn(10) + x2 = torch.randn(10) + res1 = torch.add(input=x1, other=x2) + res2 = torch.add(x1=x1, x2=x2) + self.assertEqual(res1, res2) + + x1 = torch.randn(10, 10, 10) + res1 = x1.sum(dim=(0, 2), keepdim=True) + res2 = x1.sum(axis=(0, 2), keepdims=True) + self.assertEqual(res1, res2) + def test_add(self): # [res] torch.add([res,] tensor1, tensor2) m1 = torch.randn(100, 100) @@ -1783,8 +1795,7 @@ def run_test(matrix_size, batches, cast): # Info should be positive for rank deficient matrices a = cast(torch.ones(5, 3, 3)) - if not (a.is_cuda and any(x in torch.version.cuda for x in ['8.0', '9.2'])): - self.assertGreater(a.lu(get_infos=True)[2][0], 0) + self.assertGreater(a.lu(get_infos=True)[2][0], 0) # Error checking, no pivoting variant on CPU with self.assertRaisesRegex(RuntimeError, @@ -2422,6 +2433,50 @@ def test_zeros(self): expected = torch.tensor([[0.]], dtype=torch.float16) self.assertEqual(halfTensor, expected) + def test_std_mean(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 50, 20, device=device) + for dim in range(x.dim()): + for unbiased in [False, True]: + for keepdim in [False, True]: + std1, mean1 = torch.std_mean(x, dim=dim, unbiased=unbiased, keepdim=keepdim) + std2 = x.std(dim=dim, unbiased=unbiased, keepdim=keepdim) + mean2 = x.mean(dim=dim, keepdim=keepdim) + self.assertEqual(std1, std2) + self.assertEqual(mean1, 
mean2) + + def test_std_mean_all_dims(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 50, 20, device=device) + for unbiased in [False, True]: + std1, mean1 = torch.std_mean(x, unbiased=unbiased) + std2 = x.std(unbiased=unbiased) + mean2 = x.mean() + self.assertEqual(std1, std2) + self.assertEqual(mean1, mean2) + + def test_var_mean(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 300, 50, device=device) + for dim in range(x.dim()): + for unbiased in [False, True]: + for keepdim in [False, True]: + var1, mean1 = torch.var_mean(x, dim=dim, unbiased=unbiased, keepdim=keepdim) + var2 = x.var(dim=dim, unbiased=unbiased, keepdim=keepdim) + mean2 = x.mean(dim=dim, keepdim=keepdim) + self.assertEqual(var1, var2) + self.assertEqual(mean1, mean2) + + def test_var_mean_all_dims(self): + for device in torch.testing.get_all_device_types(): + x = torch.rand(100, 50, 20, device=device) + for unbiased in [False, True]: + var1, mean1 = torch.var_mean(x, unbiased=unbiased) + var2 = x.var(unbiased=unbiased) + mean2 = x.mean() + self.assertEqual(var1, var2) + self.assertEqual(mean1, mean2) + def test_zeros_like(self): expected = torch.zeros(100, 100) @@ -2458,58 +2513,59 @@ def test_zeros_out(self): self.assertEqual(torch.zeros(shape), torch.zeros(shape, layout=torch.strided, out=out)) self.assertEqual(torch.zeros(shape), torch.zeros(shape, device='cpu', out=out)) - @staticmethod - def _test_histc(self, device): - # negative nbins throws - with self.assertRaisesRegex(RuntimeError, 'bins must be > 0'): - torch.histc(torch.tensor([1], dtype=torch.float, device=device), bins=-1) - - # without nbins - actual = torch.histc( - torch.tensor([2, 5], dtype=torch.float, device=device)) - expected = torch.zeros(100, dtype=torch.float, device=device) - expected.data[0] = 1 - expected.data[99] = 1 - self.assertEqual(expected, actual) - # tensor with the same element - actual = torch.histc(torch.ones(5, dtype=torch.float, device=device), bins=5) - self.assertEqual( - torch.tensor([0, 0, 5, 0, 0], dtype=torch.float, device=device), - actual) - # no element falls between [min, max] - actual = torch.histc( - torch.ones(5, dtype=torch.float, device=device), bins=5, min=2, max=3) - self.assertEqual( - torch.tensor([0, 0, 0, 0, 0], dtype=torch.float, device=device), - actual) - # element falls below min + integral bin size and - actual = torch.histc( - torch.tensor([2, 4, 2, 2, 5, 4], dtype=torch.float, device=device), - bins=5, min=1, max=5) - self.assertEqual( - torch.tensor([0, 3, 0, 2, 1], dtype=torch.float, device=device), - actual) - # non-integral bin size - actual = torch.histc( - torch.tensor([1, 2, 1], dtype=torch.float, device=device), - bins=4, min=0, max=3) - self.assertEqual( - torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), - actual) - # double input - actual = torch.histc( - torch.tensor([1, 2, 1], dtype=torch.double, device=device), - bins=4, min=0, max=3) - self.assertEqual( - torch.tensor([0, 2, 1, 0], dtype=torch.double, device=device), - actual) - # mixed input - actual = torch.histc( - torch.tensor([1., 2, 1], dtype=torch.float, device=device), - bins=4, min=0, max=3) - self.assertEqual( - torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), - actual) + def test_histc(self): + for device in torch.testing.get_all_device_types(): + # negative nbins throws + with self.assertRaisesRegex(RuntimeError, 'bins must be > 0'): + torch.histc(torch.tensor([1], dtype=torch.float, device=device), bins=-1) + + # without nbins + 
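# Aside: minimal usage sketch of the fused reductions exercised by the tests above;
# torch.std_mean / torch.var_mean return a (std, mean) / (var, mean) pair that should
# agree with separate std()/var() and mean() calls (shapes here are arbitrary):
import torch

x = torch.rand(4, 5)
std, mean = torch.std_mean(x, dim=1, unbiased=True, keepdim=False)
assert torch.allclose(std, x.std(dim=1, unbiased=True))
assert torch.allclose(mean, x.mean(dim=1))

var, mean = torch.var_mean(x, unbiased=False)
assert torch.allclose(var, x.var(unbiased=False))
assert torch.allclose(mean, x.mean())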
actual = torch.histc( + torch.tensor([2, 5], dtype=torch.float, device=device)) + expected = torch.zeros(100, dtype=torch.float, device=device) + expected.data[0] = 1 + expected.data[99] = 1 + self.assertEqual(expected, actual) + # tensor with the same element + actual = torch.histc(torch.ones(5, dtype=torch.float, device=device), bins=5) + self.assertEqual( + torch.tensor([0, 0, 5, 0, 0], dtype=torch.float, device=device), + actual) + # no element falls between [min, max] + actual = torch.histc( + torch.ones(5, dtype=torch.float, device=device), bins=5, min=2, max=3) + self.assertEqual( + torch.tensor([0, 0, 0, 0, 0], dtype=torch.float, device=device), + actual) + # element falls below min + integral bin size and + actual = torch.histc( + torch.tensor([2, 4, 2, 2, 5, 4], dtype=torch.float, device=device), + bins=5, min=1, max=5) + self.assertEqual( + torch.tensor([0, 3, 0, 2, 1], dtype=torch.float, device=device), + actual) + # non-integral bin size + actual = torch.histc( + torch.tensor([1, 2, 1], dtype=torch.float, device=device), + bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), + actual) + # double input + actual = torch.histc( + torch.tensor([1, 2, 1], dtype=torch.double, device=device), bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.double, device=device), + actual) + self.assertEqual(actual.dtype, torch.double) + # mixed input + actual = torch.histc( + torch.tensor([1., 2, 1], dtype=torch.float, device=device), + bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.float, device=device), + actual) + self.assertEqual(actual.dtype, torch.float) # test against numpy.histogram() def test_against_np(tensor, bins=100, min=0, max=0): @@ -2540,9 +2596,6 @@ def test_against_np(tensor, bins=100, min=0, max=0): expanded = torch.randn(1, 5, 1, 2, device=device).expand(3, 5, 7, 2) test_against_np(expanded) - def test_histc_cpu(self): - self._test_histc(self, 'cpu') - def test_ones(self): res1 = torch.ones(100, 100) res2 = torch.Tensor() @@ -2591,6 +2644,18 @@ def test_copy_dtypes(self): copied_dtype = copy.deepcopy(dtype) self.assertIs(dtype, copied_dtype) + def test_copy_transpose(self): + x = torch.arange(100 * 100, dtype=torch.float).reshape(100, 100).t() + y = torch.empty(100, 100, dtype=torch.float) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + + y = torch.empty(100, 100, dtype=torch.double) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + def test_device(self): cpu = torch.device('cpu') self.assertEqual('cpu', str(cpu)) @@ -2680,7 +2745,7 @@ def test_qtensor(self): r = torch.ones(num_elements, dtype=torch.float) scale = 1.0 zero_point = 2 - qr = r.quantize_linear(scale, zero_point) + qr = r.quantize_linear(scale, zero_point, torch.quint8) self.assertEqual(qr.q_scale(), scale) self.assertEqual(qr.q_zero_point(), zero_point) self.assertTrue(qr.is_quantized) @@ -2698,7 +2763,7 @@ def test_qtensor(self): # Scalar Tensor # item r = torch.ones(1, dtype=torch.float) - qr = r.quantize_linear(scale, zero_point) + qr = r.quantize_linear(scale, zero_point, torch.quint8) self.assertEqual(qr.item(), 1) self.assertEqual(qr[0].item(), 1) # assignment @@ -2711,12 +2776,12 @@ def test_qtensor(self): self.assertEqual(qr.item(), 15) # we can also print a qtensor self.assertEqual(str(qr), - "tensor([15.], size=(1,), dtype=torch.qint8, " + + "tensor([15.], size=(1,), 
dtype=torch.quint8, " + "scale=1.0, zero_point=2)") empty_r = torch.ones((0, 1), dtype=torch.float) - empty_qr = empty_r.quantize_linear(scale, zero_point) + empty_qr = empty_r.quantize_linear(scale, zero_point, torch.quint8) self.assertEqual(str(empty_qr), - "tensor([], size=(0, 1), dtype=torch.qint8, " + + "tensor([], size=(0, 1), dtype=torch.quint8, " + "scale=1.0, zero_point=2)") def test_qtensor_quant_dequant(self): @@ -2724,7 +2789,7 @@ def test_qtensor_quant_dequant(self): r = torch.from_numpy(r).float() scale = 2 zero_point = 2 - qr = r.quantize_linear(scale, zero_point) + qr = r.quantize_linear(scale, zero_point, torch.quint8) rqr = qr.dequantize() self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) @@ -2733,8 +2798,38 @@ def test_qtensor_creation(self): zero_point = 10 val = 100 numel = 10 - q = torch._empty_affine_quantized(numel, dtype=torch.qint8, scale=scale, zero_point=zero_point) - # TODO: check dequantized values? + q = torch._empty_affine_quantized([numel], scale=scale, zero_point=zero_point, dtype=torch.quint8) + self.assertEqual(scale, q.q_scale()) + self.assertEqual(zero_point, q.q_zero_point()) + + # create Tensor from uint8_t Tensor, scale and zero_point + int_tensor = torch.randint(0, 100, size=(10,), dtype=torch.uint8) + q = torch._per_tensor_affine_qtensor(int_tensor, scale, zero_point) + self.assertEqual(int_tensor, q.int_repr()) + self.assertEqual(scale, q.q_scale()) + self.assertEqual(zero_point, q.q_zero_point()) + + def test_qtensor_dtypes(self): + r = np.random.rand(3, 2) * 2 - 4 + r = torch.from_numpy(r).float() + scale = 2 + zero_point = 2 + qr = r.quantize_linear(scale, zero_point, torch.qint8) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + qr = r.quantize_linear(scale, zero_point, torch.quint8) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + qr = r.quantize_linear(scale, zero_point, torch.qint32) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + + def test_qtensor_dequantize_linear(self): + t = torch.arange(-10, 10, dtype=torch.int8) + scale = 3 + zero_point = 2 + qt = torch.dequantize_linear(t, scale, zero_point, torch.float) + @unittest.skipIf(torch.cuda.device_count() < 2, 'fewer than 2 GPUs detected') def test_device_guard(self): @@ -5872,18 +5967,10 @@ def test_single_det(M, target, desc): eye = torch.eye(5, device=device) test_single_det(eye, (torch.ones((), device=device), torch.zeros((), device=device)), 'identity') - # TODO: Remove when MAGMA 2.5.0 is built for CUDA 8 and CUDA 9.2 - is_cuda_8_92 = False - if torch.cuda.is_available() and torch.version.cuda is not None: - is_cuda_8_92 = any(x in torch.version.cuda for x in ['8.0', '9.2']) - def test(M): assert M.size(0) >= 5, 'this helper fn assumes M to be at least 5x5' M = M.to(device) - if M.is_cuda and is_cuda_8_92: - return - ref_M_sdet, ref_M_logabsdet = reference_slogdet(M) test_single_det(M, (ref_M_sdet, ref_M_logabsdet), 'basic') @@ -8140,6 +8227,98 @@ def fn(torchfn, *args): A_LU, pivots = fn(torch.lu, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) + def check_single_matmul(self, x, y, shape): + a = np.array(x, copy=False) + b = np.array(y, copy=False) + expected = np.matmul(a, b) + self.assertTrue(expected.flags['C_CONTIGUOUS']) + + ans = torch.matmul(x, y) + self.assertTrue(ans.is_contiguous()) + self.assertTrue(np.array_equal(ans, expected)) + + out = torch.zeros(*shape, dtype=torch.int64) + 
ans = torch.matmul(x, y, out=out) + self.assertIs(ans, out) + self.assertTrue(ans.is_contiguous()) + self.assertTrue(np.array_equal(ans, expected)) + + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_matmul_small_brute_force_1d_Nd(self): + # Issue #20452: range(0, 10) does not work. + n = 1 + for m in range(1, 8): + for p in range(1, 8): + for o in range(1, 5): + # 1d, 3d, inner dimensions C + x = torch.arange(m) + y = torch.arange(o * m * p).reshape(o, m, p) + self.check_single_matmul(x, y, (o, n, p)) + + # 1d, 3d, inner dimensions Fortran + x = torch.arange(m) + y = torch.arange(o * p * m).reshape(o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (o, n, p)) + + # 1d, 3d, inner dimensions non-contiguous + x = torch.arange(2 * m)[::2] + y = torch.arange(o * m * 2 * p).reshape(o, m, 2 * p)[:, :, ::2] + self.check_single_matmul(x, y, (o, n, p)) + + for r in range(1, 5): + # 1d, 4d, inner dimensions C + x = torch.arange(m) + y = torch.arange(r * o * m * p).reshape(r, o, m, p) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 1d, 4d, inner dimensions Fortran + x = torch.arange(m) + y = torch.arange(r * o * p * m).reshape(r, o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 1d, 4d, inner dimensions non-contiguous + x = torch.arange(2 * m)[::2] + y = torch.arange(r * o * m * 2 * p).reshape(r, o, m, 2 * p)[:, :, :, ::2] + self.check_single_matmul(x, y, (r, o, n, p)) + + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_matmul_small_brute_force_2d_Nd(self): + # Issue #20452: range(0, 10) does not work. + for n in range(1, 5): + for m in range(1, 5): + for p in range(1, 5): + for o in range(1, 3): + # 2d, 3d, inner dimensions C + x = torch.arange(n * m).reshape(n, m) + y = torch.arange(o * m * p).reshape(o, m, p) + self.check_single_matmul(x, y, (o, n, p)) + + # 2d, 3d, inner dimensions Fortran + x = torch.arange(m * n).reshape(m, n).transpose(-1, -2) + y = torch.arange(o * p * m).reshape(o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (o, n, p)) + + # 2d, 3d, inner dimensions non-contiguous + x = torch.arange(n * 2 * m).reshape(n, 2 * m)[:, ::2] + y = torch.arange(o * m * 2 * p).reshape(o, m, 2 * p)[:, :, ::2] + self.check_single_matmul(x, y, (o, n, p)) + + for r in range(1, 2): + # 2d, 4d, inner dimensions C + x = torch.arange(n * m).reshape(n, m) + y = torch.arange(r * o * m * p).reshape(r, o, m, p) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 2d, 4d, inner dimensions Fortran + x = torch.arange(m * n).reshape(m, n).transpose(-1, -2) + y = torch.arange(r * o * p * m).reshape(r, o, p, m).transpose(-1, -2) + self.check_single_matmul(x, y, (r, o, n, p)) + + # 2d, 4d, inner dimensions non-contiguous + x = torch.arange(n * 2 * m).reshape(n, 2 * m)[:, ::2] + y = torch.arange(r * o * m * 2 * p).reshape(r, o, m, 2 * p)[:, :, :, ::2] + self.check_single_matmul(x, y, (r, o, n, p)) + @skipIfRocm def test_blas_alpha_beta_empty(self): for device in torch.testing.get_all_device_types(): @@ -9314,7 +9493,7 @@ def test_serialization_offset_filelike(self): i, j = 41, 43 with BytesIOContext() as f: pickle.dump(i, f) - torch.save(a, f) + torch.save(a, f) pickle.dump(j, f) torch.save(b, f) f.seek(0) @@ -11348,6 +11527,21 @@ def test_c10_layer_norm(self): weight), torch.tensor(bias), 1, epsilon, True) torch.testing.assert_allclose(expected_norm, actual_norm) + def test_memory_format(self): + x = torch.randn(10, 3, 32, 32) + nhwc = x.contiguous(memory_format=torch.channels_last) + self.assertFalse(nhwc.is_contiguous()) + 
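# Aside on what the channels_last checks in this test assert: a tensor converted with
# contiguous(memory_format=torch.channels_last) keeps its logical NCHW sizes but
# carries NHWC-style strides, so plain is_contiguous() is False while the
# channels_last query is True (illustrative shapes):
import torch

t = torch.randn(2, 3, 4, 4)
t_nhwc = t.contiguous(memory_format=torch.channels_last)
print(t_nhwc.size())    # torch.Size([2, 3, 4, 4])  -- logical shape unchanged
print(t_nhwc.stride())  # (48, 1, 12, 3)            -- channel stride is 1
print(t_nhwc.is_contiguous(memory_format=torch.channels_last))  # True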
self.assertTrue(nhwc.is_contiguous(memory_format=torch.channels_last)) + self.assertEqual(nhwc, x) + + + @unittest.skipIf(not torch.cuda.is_available(), 'no CUDA') + def test_memory_format_permute_cuda(self): + x = torch.randn(10, 3, 32, 32) + nhwc = x.contiguous(memory_format=torch.channels_last).cuda() + y = nhwc.permute(0, 1, 3, 2).permute(0, 1, 3, 2) + self.assertFalse(y.is_contiguous(memory_format=torch.channels_last)) + def test_subclass_tensors(self): # raise an error when trying to subclass FloatTensor with self.assertRaisesRegex(TypeError, "type 'torch.FloatTensor' is not an acceptable base type"): @@ -11361,13 +11555,20 @@ def foo(self): f = Foo2() self.assertEqual(f.foo(), 5) + def test_ndim(self): + a = torch.randn(1, 2, 3) + self.assertEqual(3, a.ndim) + b = torch.randn(()) + self.assertEqual(0, b.ndim) + c = torch.randn(1, 0) + self.assertEqual(2, c.ndim) + # Functions to test negative dimension wrapping METHOD = 1 INPLACE_METHOD = 2 FUNCTIONAL = 4 DIM_ARG = None - def make_neg_dim_test(name, tensor_arg, arg_constr, types, extra_dim=0): def neg_dim_test(self): if isinstance(tensor_arg, list): diff --git a/third_party/fbgemm b/third_party/fbgemm index 6ec218e6ed5d..9ae8912fc9d0 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 6ec218e6ed5dcb9b5397a608a3b5b8027b236819 +Subproject commit 9ae8912fc9d09cd22f333c226188cc161d9509a6 diff --git a/third_party/onnx b/third_party/onnx index 5bde6371620b..cc2333a3f929 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 5bde6371620b76302864bce90f521d72eda95d0e +Subproject commit cc2333a3f929caca7223b98699237f19388dd585 diff --git a/third_party/sleef b/third_party/sleef index 191f655caa25..9b249c53a803 160000 --- a/third_party/sleef +++ b/third_party/sleef @@ -1 +1 @@ -Subproject commit 191f655caa25526ae226cf88dd2529265176014a +Subproject commit 9b249c53a80343cc1a394ca961d7d5696ea76409 diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index e765e5aaaf1f..121e069e0b1e 100644 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -93,7 +93,7 @@ for new_dir in args.extra_include_dir: abs_new_dir = os.path.join(proj_dir, new_dir) if os.path.exists(abs_new_dir): - new_dir = os.path.join(new_dir, '*') + new_dir = os.path.join(new_dir, '**/*') includes.append(new_dir) ignores = [ diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index aa51c670528a..fbdb4c6a3b33 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -498,13 +498,13 @@ other: grad.clone().masked_fill_(self > other, 0) - name: mean(Tensor self) - self: grad.expand(self.sizes()) / self.numel() + self: mean_backward(grad, self.sizes(), self.numel()) - name: mean(Tensor self, ScalarType dtype) self: grad.expand(self.sizes()).to(self.scalar_type()) / self.numel() - name: mean(Tensor self, IntArrayRef dim, bool keepdim) - self: sum_backward(grad, self.sizes(), dim, keepdim) / _safe_size(self.sizes(), dim) + self: mean_backward(grad, self.sizes(), dim, keepdim) - name: mean(Tensor self, IntArrayRef dim, ScalarType dtype) self: sum_backward(grad, self.sizes(), dim, false).to(self.scalar_type()) / _safe_size(self.sizes(), dim) @@ -774,10 +774,10 @@ self: unsqueeze_to(grad, dim, self.sizes()) - name: std(Tensor self, bool unbiased) - self: var_backward(grad / (result * 2), self, unbiased) + self: std_backward(result, grad, self, unbiased) - name: std(Tensor self, IntArrayRef dim, bool unbiased, bool keepdim) - self: 
var_backward(grad / (result * 2), self, dim, unbiased, keepdim) + self: std_backward(result, grad, self, dim, unbiased, keepdim) - name: sub(Tensor self, Tensor other, *, Scalar alpha) self: grad @@ -1486,3 +1486,15 @@ # PackedSequence helpers - name: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) input: _pack_padded_sequence_backward(grad, input.sizes(), result1, batch_first) + +- name: std_mean(Tensor self, IntArrayRef dim, bool unbiased, bool keepdim) + self: var_std_mean_backward(grads, self, result0, result1, dim, unbiased, keepdim, true) + +- name: var_mean(Tensor self, IntArrayRef dim, bool unbiased, bool keepdim) + self: var_std_mean_backward(grads, self, result0, result1, dim, unbiased, keepdim, false) + +- name: std_mean(Tensor self, bool unbiased) + self: var_std_mean_backward(grads, self, result0, result1, unbiased, true) + +- name: var_mean(Tensor self, bool unbiased) + self: var_std_mean_backward(grads, self, result0, result1, unbiased, false) diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 698899c51c75..48199d332e3b 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -676,6 +676,46 @@ Tensor var_backward(Tensor grad, const Tensor & self, IntArrayRef dim, bool unbi return (2.0 / (_safe_size(self.sizes(), dim) - unbiased)) * grad * (self - self.mean(dim, true)); } +Tensor std_backward(const Tensor & result, const Tensor & grad, const Tensor & self, bool unbiased) { + return var_backward(grad / (result * 2), self, unbiased); +} + +Tensor std_backward(const Tensor & result, Tensor grad, const Tensor & self, IntArrayRef dim, bool unbiased, bool keepdim) { + return var_backward(grad / (result * 2), self, dim, unbiased, keepdim); +} + +Tensor mean_backward(Tensor grad, const IntArrayRef sizes, IntArrayRef dim, bool keepdim) { + return sum_backward(grad, sizes, dim, keepdim) / _safe_size(sizes, dim); +} + +Tensor mean_backward(Tensor grad, const IntArrayRef sizes, int numel) { + return grad.expand(sizes) / numel; +} + +Tensor var_std_mean_backward(const variable_list& grads, const Tensor & self, const Tensor & r1, const Tensor & r2, IntArrayRef dim, bool unbiased, bool keepdim, bool is_std) { + Tensor grad; + if (grads[0].defined()) { + grad = is_std ? std_backward(r1, grads[0], self, dim, unbiased, keepdim) : var_backward(grads[0], self, dim, unbiased, keepdim); + } + if (grads[1].defined()) { + Tensor mean_grad = mean_backward(grads[1], self.sizes(), dim, keepdim); + grad = grads[0].defined() ? grad + mean_grad : mean_grad; + } + return grad; +} + +Tensor var_std_mean_backward(const variable_list& grads, const Tensor & self, const Tensor & r1, const Tensor & r2, bool unbiased, bool is_std) { + Tensor grad; + if (grads[0].defined()) { + grad = is_std ? std_backward(r1, grads[0], self, unbiased) : var_backward(grads[0], self, unbiased); + } + if (grads[1].defined()) { + Tensor mean_grad = mean_backward(grads[1], self.sizes(), self.numel()); + grad = grads[0].defined() ? grad + mean_grad : mean_grad; + } + return grad; +} + Tensor masked_scatter_backward(const Tensor & grad, const Tensor & mask, IntArrayRef sizes) { int64_t numel = 1; for (auto size : sizes) { @@ -1581,7 +1621,7 @@ std::tuple prelu_double_backward( // This makes no assumption on the signs of sigma. 
Tensor svd_backward(const std::vector &grads, const Tensor& self, bool some, bool compute_uv, const Tensor& raw_u, const Tensor& sigma, const Tensor& raw_v) { - AT_CHECK(compute_uv, + TORCH_CHECK(compute_uv, "svd_backward: Setting compute_uv to false in torch.svd doesn't compute singular matrices, ", "and hence we cannot compute backward. Please use torch.svd(compute_uv=True)"); @@ -1664,7 +1704,7 @@ Tensor svd_backward(const std::vector &grads, const T // http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf Tensor symeig_backward(const std::vector &grads, const Tensor& self, bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) { - AT_CHECK(eigenvectors, + TORCH_CHECK(eigenvectors, "symeig_backward: Setting eigenvectors to false in torch.symeig doesn't compute eigenvectors ", "and hence we cannot compute backward. Please use torch.symeig(eigenvectors=True)"); diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index b368f99c0839..d12cc2dea11a 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -21,6 +21,7 @@ using at::Context; using at::Device; using at::Generator; using at::IntArrayRef; +using at::MemoryFormat; using at::Scalar; using at::ScalarType; using at::SparseTensorRef; diff --git a/tools/autograd/templates/python_nn_functions.cpp b/tools/autograd/templates/python_nn_functions.cpp index 96de55065a71..628fd740117d 100644 --- a/tools/autograd/templates/python_nn_functions.cpp +++ b/tools/autograd/templates/python_nn_functions.cpp @@ -2,6 +2,7 @@ // ${generated_comment} + #include "torch/csrc/Device.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" @@ -15,6 +16,7 @@ using at::Tensor; using at::Scalar; +using at::MemoryFormat; using namespace torch::autograd::utils; namespace torch { namespace autograd { diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 6542dddc3748..3b62d90ee04c 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -121,7 +121,7 @@ static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* k .pinned_memory(r.toBool(5)); return wrap(dispatch_arange(end, options)); } else { - AT_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible"); + TORCH_CHECK(!r.toBool(5), " `pin_memory` and `out` parameters are incompatible"); check_out_type_matches(r.tensor(1), r.scalartype(2), r.isNone(2), r.layout(3), r.isNone(3), r.device(4), r.isNone(4)); return wrap(dispatch_arange(r.scalar(0), r.tensor(1)).set_requires_grad(r.toBool(6))); @@ -141,7 +141,7 @@ static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* k .pinned_memory(r.toBool(7)); return wrap(dispatch_arange(start, end, step, options)); } else { - AT_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible"); + TORCH_CHECK(!r.toBool(7), " `pin_memory` and `out` parameters are incompatible"); check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), r.layout(5), r.isNone(5), r.device(6), r.isNone(6)); return wrap(dispatch_arange(r.scalar(0), r.scalar(1), r.scalar(2), r.tensor(3)).set_requires_grad(r.toBool(8))); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index eb694febbda0..4397697bf2cd 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ 
b/tools/autograd/templates/python_variable_methods.cpp @@ -143,17 +143,24 @@ static PyObject * THPVariable_dim(PyObject* self, PyObject* args) END_HANDLE_TH_ERRORS } -static Tensor dispatch_contiguous(const Tensor & self) { +static Tensor dispatch_contiguous(const Tensor & self, at::MemoryFormat memory_format) { AutoNoGIL no_gil; OptionalDeviceGuard device_guard(device_of(self)); - return self.contiguous(); + return self.contiguous(memory_format); } - static PyObject * THPVariable_contiguous(PyObject* self, PyObject* args) + +static PyObject * THPVariable_contiguous(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "contiguous(*, MemoryFormat memory_format=contiguous_format)", + }); + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); auto& self_ = reinterpret_cast(self)->cdata; + auto memory_format = r.toMemoryFormat(0); // avoids touching the GIL or current device if self is already contiguous - if (self_.is_contiguous()) { + if (self_.is_contiguous(memory_format)) { // NOTE: this logic is duplicated from VariableType.cpp. Since we need to // record this call to contiguous() in the trace regardless of whether // we actually call contiguous here, we need to record this information @@ -163,13 +170,14 @@ static Tensor dispatch_contiguous(const Tensor & self) { auto node = tracer_state->graph->create(jit::aten::contiguous, /*num_outputs=*/0); jit::tracer::recordSourceLocation(node); jit::tracer::addInputs(node, "self", self_); + jit::tracer::addInputs(node, "memory_format", memory_format); tracer_state->graph->insertNode(node); jit::tracer::addOutput(node, self_); } Py_INCREF(self); return self; } - return THPVariable_Wrap(dispatch_contiguous(self_)); + return THPVariable_Wrap(dispatch_contiguous(self_, memory_format)); END_HANDLE_TH_ERRORS } @@ -321,7 +329,7 @@ static PyObject * THPVariable_cuda(PyObject* self, PyObject* args, PyObject* kwa ParsedArgs<2> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); auto device = r.isNone(0) ? 
at::Device(at::DeviceType::CUDA) : r.device(0); - AT_CHECK(device.is_cuda(), "Invalid device, must be cuda device"); + TORCH_CHECK(device.is_cuda(), "Invalid device, must be cuda device"); torch::utils::cuda_lazy_init(); return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false)); END_HANDLE_TH_ERRORS @@ -432,15 +440,21 @@ static PyObject * THPVariable_requires_grad_(PyObject* self, PyObject* args, PyO END_HANDLE_TH_ERRORS } -inline bool dispatch_is_contiguous(Tensor & self) { - return self.is_contiguous(); +inline bool dispatch_is_contiguous(Tensor & self, MemoryFormat memory_format) { + return self.is_contiguous(memory_format); } -static PyObject * THPVariable_is_contiguous(PyObject* self_, PyObject* args) +static PyObject * THPVariable_is_contiguous(PyObject* self_, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "is_contiguous(*, MemoryFormat memory_format=contiguous_format)", + }); + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + auto memory_format = r.toMemoryFormat(0); auto& self = reinterpret_cast(self_)->cdata; - return wrap(dispatch_is_contiguous(self)); + return wrap(dispatch_is_contiguous(self, memory_format)); END_HANDLE_TH_ERRORS } @@ -686,7 +700,7 @@ PyMethodDef variable_methods[] = { {"apply_", (PyCFunction)THPVariable_apply_, METH_O, NULL}, {"byte", (PyCFunction)THPVariable_byte, METH_NOARGS, NULL}, {"char", (PyCFunction)THPVariable_char, METH_NOARGS, NULL}, - {"contiguous", (PyCFunction)THPVariable_contiguous, METH_NOARGS, NULL}, + {"contiguous", (PyCFunction)THPVariable_contiguous, METH_VARARGS | METH_KEYWORDS, NULL}, {"copy_", (PyCFunction)THPVariable_copy_, METH_VARARGS | METH_KEYWORDS, NULL}, {"cpu", (PyCFunction)THPVariable_cpu, METH_NOARGS, NULL}, {"cuda", (PyCFunction)THPVariable_cuda, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -698,7 +712,7 @@ PyMethodDef variable_methods[] = { {"bool", (PyCFunction)THPVariable_bool, METH_NOARGS, NULL}, {"half", (PyCFunction)THPVariable_half, METH_NOARGS, NULL}, {"int", (PyCFunction)THPVariable_int, METH_NOARGS, NULL}, - {"is_contiguous", (PyCFunction)THPVariable_is_contiguous, METH_NOARGS, NULL}, + {"is_contiguous", (PyCFunction)THPVariable_is_contiguous, METH_VARARGS | METH_KEYWORDS, NULL}, {"item", (PyCFunction)THPVariable_item, METH_NOARGS, NULL}, {"long", (PyCFunction)THPVariable_long, METH_NOARGS, NULL}, {"map_", (PyCFunction)THPVariable_map_, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/tools/autograd/templates/variable_factories.h b/tools/autograd/templates/variable_factories.h index 29b1e559392f..7fbcea2ffd99 100644 --- a/tools/autograd/templates/variable_factories.h +++ b/tools/autograd/templates/variable_factories.h @@ -4,7 +4,6 @@ #include #include -#include #include #include diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py index 62ec73c0f5b9..4d21610bb41e 100644 --- a/tools/build_pytorch_libs.py +++ b/tools/build_pytorch_libs.py @@ -81,14 +81,17 @@ def cmake_defines(lst, **kwargs): def overlay_windows_vcvars(env): - from distutils._msvccompiler import _get_vc_env - vc_arch = 'x64' if IS_64BIT else 'x86' - vc_env = _get_vc_env(vc_arch) - for k, v in env.items(): - lk = k.lower() - if lk not in vc_env: - vc_env[lk] = v - return vc_env + if sys.version_info >= (3, 5): + from distutils._msvccompiler import _get_vc_env + vc_arch = 'x64' if IS_64BIT else 'x86' + vc_env = _get_vc_env(vc_arch) + for k, v in env.items(): + lk = k.lower() + if lk not in vc_env: + vc_env[lk] = v + return vc_env + else: + return env def 
mkdir_p(dir): @@ -111,10 +114,10 @@ def create_build_env(): my_env['CUDA_BIN_PATH'] = escape_path(CUDA_HOME) if IS_WINDOWS: - my_env = overlay_windows_vcvars(my_env) # When using Ninja under Windows, the gcc toolchain will be chosen as default. # But it should be set to MSVC as the user's first choice. if USE_NINJA: + my_env = overlay_windows_vcvars(my_env) cc = my_env.get('CC', 'cl') cxx = my_env.get('CXX', 'cl') my_env['CC'] = cc @@ -134,10 +137,10 @@ def run_cmake(version, if USE_NINJA: cmake_args.append('-GNinja') elif IS_WINDOWS: + cmake_args.append('-GVisual Studio 15 2017') if IS_64BIT: - cmake_args.append('-GVisual Studio 15 2017 Win64') - else: - cmake_args.append('-GVisual Studio 15 2017') + cmake_args.append('-Ax64') + cmake_args.append('-Thost=x64') try: import numpy as np NUMPY_INCLUDE_DIR = np.get_include() @@ -163,7 +166,7 @@ def run_cmake(version, BUILDING_WITH_TORCH_LIBS=os.getenv("BUILDING_WITH_TORCH_LIBS", "ON"), TORCH_BUILD_VERSION=version, CMAKE_BUILD_TYPE=build_type, - BUILD_TORCH=os.getenv("BUILD_TORCH", "ON"), + CMAKE_VERBOSE_MAKEFILE="ON", BUILD_PYTHON=build_python, BUILD_SHARED_LIBS=os.getenv("BUILD_SHARED_LIBS", "ON"), BUILD_BINARY=check_env_flag('BUILD_BINARY'), @@ -207,8 +210,12 @@ def run_cmake(version, USE_REDIS=os.getenv('USE_REDIS'), USE_GLOG=os.getenv('USE_GLOG'), USE_GFLAGS=os.getenv('USE_GFLAGS'), + USE_ASAN=check_env_flag('USE_ASAN'), WERROR=os.getenv('WERROR')) + if os.getenv('_GLIBCXX_USE_CXX11_ABI'): + cmake_defines(cmake_args, GLIBCXX_USE_CXX11_ABI=os.getenv('_GLIBCXX_USE_CXX11_ABI')) + if os.getenv('USE_OPENMP'): cmake_defines(cmake_args, USE_OPENMP=check_env_flag('USE_OPENMP')) diff --git a/tools/build_variables.py b/tools/build_variables.py index 9080b47a911a..81744686a3e4 100644 --- a/tools/build_variables.py +++ b/tools/build_variables.py @@ -3,7 +3,7 @@ # not currently relevant so they are combined into one list. 
from __future__ import absolute_import, division, print_function, unicode_literals load("@bazel_skylib//lib:new_sets.bzl", "sets") - +load("//caffe2/caffe2/fb:defs_gpu.bzl", "gpu_library_selector") GENERATED_CPP = [ "Functions.cpp", @@ -68,6 +68,7 @@ "torch/csrc/jit/register_c10_ops.cpp", "torch/csrc/jit/subgraph_matcher.cpp", "torch/csrc/jit/symbolic_script.cpp", + "torch/csrc/jit/profiling_graph_executor_impl.cpp", "torch/csrc/jit/profiling_record.cpp", "torch/csrc/jit/operator.cpp", "torch/csrc/jit/passes/alias_analysis.cpp", @@ -84,6 +85,7 @@ "torch/csrc/jit/passes/graph_fuser.cpp", "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp", "torch/csrc/jit/passes/inplace_check.cpp", + "torch/csrc/jit/passes/insert_guards.cpp", "torch/csrc/jit/passes/loop_unrolling.cpp", "torch/csrc/jit/passes/lower_grad_of.cpp", "torch/csrc/jit/passes/lower_tuples.cpp", @@ -143,7 +145,6 @@ def add_torch_libs(): r = {} - c2_gpu = (read_config("caffe2", "gpu", "1") == "1") libtorch_python_sources = [ ":generate-code=THNN.cpp", ":generate-code=python_functions.cpp", @@ -157,6 +158,7 @@ def add_torch_libs(): "torch/csrc/DynamicTypes.cpp", "torch/csrc/Generator.cpp", "torch/csrc/Layout.cpp", + "torch/csrc/MemoryFormat.cpp", "torch/csrc/Module.cpp", "torch/csrc/PtrWrapper.cpp", "torch/csrc/Size.cpp", @@ -200,6 +202,7 @@ def add_torch_libs(): "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/byte_order.cpp", "torch/csrc/distributed/Module.cpp", + "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/jit/init.cpp", @@ -231,6 +234,7 @@ def add_torch_libs(): "torch/csrc/utils/tensor_apply.cpp", "torch/csrc/utils/tensor_dtypes.cpp", "torch/csrc/utils/tensor_layouts.cpp", + "torch/csrc/utils/tensor_memoryformats.cpp", "torch/csrc/utils/tensor_list.cpp", "torch/csrc/utils/tensor_new.cpp", "torch/csrc/utils/tensor_numpy.cpp", @@ -252,25 +256,26 @@ def add_torch_libs(): "torch/csrc/distributed/c10d/ddp.cpp", ] + compiler_flags_cpu = [ + "-D_THP_CORE", + "-DUSE_C10D", + "-DUSE_DISTRIBUTED", + "-DUSE_NUMPY", + "-DUSE_SCALARS", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DPYTORCH_ONNX_CAFFE2_BUNDLE", + "-Wno-write-strings", + "-Wno-format", + "-Wno-strict-aliasing", + "-Wno-non-virtual-dtor", + "-Wno-shadow-compatible-local", + "-Wno-empty-body", + ] + compiler_flags_cuda = [ + "-DUSE_CUDNN", + "-DUSE_NCCL", + ] common_flags = { - "compiler_flags": [ - "-D_THP_CORE", - "-DUSE_C10D", - "-DUSE_DISTRIBUTED", - "-DUSE_NUMPY", - "-DUSE_SCALARS", - "-DNO_CUDNN_DESTROY_HANDLE", - "-DPYTORCH_ONNX_CAFFE2_BUNDLE", - "-Wno-write-strings", - "-Wno-format", - "-Wno-strict-aliasing", - "-Wno-non-virtual-dtor", - "-Wno-shadow-compatible-local", - "-Wno-empty-body", - ] + ([ - "-DUSE_CUDNN", - "-DUSE_NCCL", - ] if c2_gpu else []), "compiler_specific_flags": { "clang": [ "-Wno-absolute-value", @@ -307,6 +312,7 @@ def add_torch_libs(): ("nanopb", None, "protobuf-nanopb"), ("protobuf", None), ], + compiler_flags=compiler_flags_cpu, **common_flags ) @@ -336,29 +342,55 @@ def add_torch_libs(): ("cuda", None, "nvrtc-lazy"), ("cuda", None, "nvrtc-builtins-lazy"), ], + compiler_flags=compiler_flags_cpu + compiler_flags_cuda, **common_flags ) # TODO: split it into cpp and cuda parts similarly to libtorch - cpp_library( + gpu_library_selector( name="_C_impl", - srcs=libtorch_python_sources + ( - libtorch_python_cuda_sources if c2_gpu else [] - ), + deps_cpu=[":_C_impl_cpu"], + deps_cuda=[":_C_impl_cuda"], + merge_cpu_deps=False, + ) + + cpp_library( + 
name="_C_impl_cpu", + srcs=libtorch_python_sources, + link_whole=True, + deps=[ + ":libtorch", + ":thnn", + "//caffe2/torch/lib/THD:THD_cpu", + "//caffe2/torch/lib/c10d:c10d_cpu", + "//caffe2/torch/lib/libshm:libshm", + ], + external_deps=[ + ("numpy", None, "cpp"), + ("pybind11", None), + ("python", None), + ], + compiler_flags=compiler_flags_cpu, + **common_flags + ) + + cpp_library( + name="_C_impl_cuda", + srcs=libtorch_python_sources + libtorch_python_cuda_sources, link_whole=True, deps=[ + ":libtorch_cuda", ":thnn", "//caffe2/torch/lib/THD:THD", "//caffe2/torch/lib/c10d:c10d", "//caffe2/torch/lib/libshm:libshm", - ] + [ - ":libtorch_cuda" if c2_gpu else ":libtorch", ], external_deps=[ ("numpy", None, "cpp"), ("pybind11", None), ("python", None), ], + compiler_flags=compiler_flags_cpu + compiler_flags_cuda, **common_flags ) diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 483fa5f047ef..bde2d2035c2a 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -40,6 +40,7 @@ 'std::array': 'bool[4]', 'std::string': 'str', 'Scalar': 'Scalar', + 'MemoryFormat': 'MemoryFormat', 'Scalar?': 'Scalar?', 'Tensor': 'Tensor', 'Tensor?': 'Tensor?', @@ -96,6 +97,7 @@ def jit_type_of(arg): 'IntArrayRef': '{}.toIntList()->elements()', 'Layout': '{}.toLayout()', 'Layout?': '{}.toOptional()', + 'MemoryFormat': '{}.toMemoryFormat()', 'Scalar': '{}.toScalar()', 'Scalar?': '{}.toOptional()', 'ScalarType': '{}.toScalarType()', @@ -483,6 +485,7 @@ def format_arg(arg): .replace('true', 'True') \ .replace('false', 'False') \ .replace('Reduction::Mean', 'Mean') \ + .replace('MemoryFormat::Contiguous', 'contiguous_format') \ .replace('{}', 'None' if is_tensor_arg(arg) else '[]') \ .replace('{', '[') \ .replace('}', ']') diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index f3ddc100e346..fd4abf591783 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -335,6 +335,8 @@ def gen_pyi(declarations_path, out): 'as_tensor': ["def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..."], 'get_num_threads': ['def get_num_threads() -> _int: ...'], 'set_num_threads': ['def set_num_threads(num: _int) -> None: ...'], + 'get_num_interop_threads': ['def get_num_interop_threads() -> _int: ...'], + 'set_num_interop_threads': ['def set_num_interop_threads(num: _int) -> None: ...'], # These functions are explicitly disabled by # SKIP_PYTHON_BINDINGS because they are hand bound. # Correspondingly, we must hand-write their signatures. diff --git a/tools/run-clang-tidy-in-ci.sh b/tools/run-clang-tidy-in-ci.sh index 57ce28212305..39e72e1eb109 100755 --- a/tools/run-clang-tidy-in-ci.sh +++ b/tools/run-clang-tidy-in-ci.sh @@ -16,7 +16,7 @@ if [[ ! -d build ]]; then mkdir build pushd build # We really only need compile_commands.json, so no need to build! - time cmake -DBUILD_TORCH=ON .. + time cmake .. popd # Generate ATen files. diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 5654b639e45a..e7f72d6ad9de 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -1,8 +1,7 @@ -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if (NOT BUILD_TORCH) - return() - endif() -else() +# This file used to build libtorch.so. +# Now it only builds the Torch python bindings. 
+ +if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) cmake_minimum_required(VERSION 3.5 FATAL_ERROR) project(torch CXX C) find_package(Caffe2 REQUIRED) @@ -10,498 +9,34 @@ else() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) endif() -option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) +if (NOT BUILD_PYTHON) + return() +endif() set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(TORCH_ROOT "${TORCH_SRC_DIR}/..") -if(NOT TORCH_INSTALL_BIN_DIR) - set(TORCH_INSTALL_BIN_DIR bin) -endif() - -if(NOT TORCH_INSTALL_INCLUDE_DIR) - set(TORCH_INSTALL_INCLUDE_DIR include) -endif() - if(NOT TORCH_INSTALL_LIB_DIR) set(TORCH_INSTALL_LIB_DIR lib) endif() -set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - -# Generate files -set(TOOLS_PATH "${TORCH_ROOT}/tools") - -configure_file("${TORCH_ROOT}/aten/src/ATen/common_with_cwrap.py" - "${TOOLS_PATH}/shared/cwrap_common.py" - COPYONLY) - -configure_file("${TORCH_SRC_DIR}/_utils_internal.py" - "${TOOLS_PATH}/shared/_utils_internal.py" - COPYONLY) - -add_custom_command( - OUTPUT - "${TORCH_SRC_DIR}/csrc/nn/THNN.cpp" - "${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods_dispatch.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions_dispatch.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions_dispatch.h" - "${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h" - "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_1.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_2.cpp" - COMMAND - "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py - --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" - --nn-path "aten/src/" - DEPENDS - "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" - "${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h" - "${TOOLS_PATH}/autograd/templates/VariableType.h" - "${TOOLS_PATH}/autograd/templates/VariableType.cpp" - "${TOOLS_PATH}/autograd/templates/Functions.h" - "${TOOLS_PATH}/autograd/templates/Functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_functions.h" - "${TOOLS_PATH}/autograd/templates/python_functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_variable_methods.cpp" - "${TOOLS_PATH}/autograd/templates/python_variable_methods_dispatch.h" - "${TOOLS_PATH}/autograd/templates/python_torch_functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_torch_functions_dispatch.h" - "${TOOLS_PATH}/autograd/templates/python_nn_functions.cpp" - "${TOOLS_PATH}/autograd/templates/python_nn_functions.h" - 
"${TOOLS_PATH}/autograd/templates/python_nn_functions_dispatch.h" - "${TOOLS_PATH}/autograd/templates/variable_factories.h" - "${TOOLS_PATH}/autograd/deprecated.yaml" - "${TOOLS_PATH}/autograd/derivatives.yaml" - "${TOOLS_PATH}/autograd/gen_autograd_functions.py" - "${TOOLS_PATH}/autograd/gen_autograd.py" - "${TOOLS_PATH}/autograd/gen_python_functions.py" - "${TOOLS_PATH}/autograd/gen_variable_factories.py" - "${TOOLS_PATH}/autograd/gen_variable_type.py" - "${TOOLS_PATH}/autograd/load_derivatives.py" - "${TOOLS_PATH}/autograd/nested_dict.py" - "${TOOLS_PATH}/autograd/utils.py" - "${TOOLS_PATH}/jit/gen_jit_dispatch.py" - "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" - WORKING_DIRECTORY "${TORCH_ROOT}") - -set(TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp - ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp - ${TORCH_SRC_DIR}/csrc/autograd/function.cpp - ${TORCH_SRC_DIR}/csrc/autograd/function_hook.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_0.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_1.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_2.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_3.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType_4.cpp - ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp - ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp - ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp - ${TORCH_SRC_DIR}/csrc/autograd/record_function.cpp - ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/VariableTypeManual.cpp - ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp - ${TORCH_SRC_DIR}/csrc/jit/attributes.cpp - ${TORCH_SRC_DIR}/csrc/jit/argument_spec.cpp - ${TORCH_SRC_DIR}/csrc/jit/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/pass_manager.cpp - ${TORCH_SRC_DIR}/csrc/jit/pickler.cpp - ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_0.cpp - ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_1.cpp - ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops_2.cpp - ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp - ${TORCH_SRC_DIR}/csrc/jit/import_source.cpp - ${TORCH_SRC_DIR}/csrc/jit/import.cpp - ${TORCH_SRC_DIR}/csrc/jit/import_export_helpers.cpp - ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp - ${TORCH_SRC_DIR}/csrc/jit/constants.cpp - ${TORCH_SRC_DIR}/csrc/jit/node_hashing.cpp - ${TORCH_SRC_DIR}/csrc/jit/ir.cpp - ${TORCH_SRC_DIR}/csrc/jit/irparser.cpp - ${TORCH_SRC_DIR}/csrc/jit/netdef_converter.cpp - ${TORCH_SRC_DIR}/csrc/jit/operator.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_c10_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/subgraph_matcher.cpp - ${TORCH_SRC_DIR}/csrc/jit/symbolic_script.cpp - ${TORCH_SRC_DIR}/csrc/jit/profiling_record.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/alias_analysis.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/constant_pooling.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inline_autodiff_subgraphs.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp - 
${TORCH_SRC_DIR}/csrc/jit/passes/decompose_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/erase_number_types.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inline_fork_wait.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/lower_grad_of.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/remove_inplace_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/requires_grad_analysis.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_autogradzero.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/subgraph_rewrite.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/python_print.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/utils/subgraph_utils.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/utils/check_alias_annotation.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/utils/memory_dag.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/quantization.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/interface.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_quantized_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/scope.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp - ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp - ${TORCH_SRC_DIR}/csrc/jit/testing/file_check.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/final_returns.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/schema_matching.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/script_type_parser.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/sugared_value.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/class_type.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/parser.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/edit_distance.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/logging.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp - ${TORCH_SRC_DIR}/csrc/jit/script/jit_exception.cpp - ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp - ${TORCH_SRC_DIR}/csrc/jit/hooks_for_testing.cpp - ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp - ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/kernel_cache.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/compiler.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/executor.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/codegen.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/fallback.cpp - ${TORCH_ROOT}/test/cpp/jit/test.cpp - ) - -if (WIN32) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_win.cpp - ) -else () - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/dynamic_library_unix.cpp - ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp - ) - if (USE_CUDA AND NOT USE_ROCM) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/fused_kernel.cpp - ) - add_library(thnvrtc SHARED ${TORCH_SRC_DIR}/csrc/jit/fuser/cuda/thnvrtc.cpp) - target_link_libraries(thnvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB}) - target_include_directories(thnvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) - install(TARGETS thnvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") - endif() -endif () - -if (USE_CUDA) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/profiler_cuda.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp - ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp - ) -endif() - - -if (USE_ROCM) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp - 
) -endif() - - -if (NOT NO_API) - list(APPEND TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/datasets/mnist.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/distributed.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/random.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/sequential.cpp - ${TORCH_SRC_DIR}/csrc/api/src/data/samplers/stream.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/init.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/module.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/batchnorm.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/conv.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/dropout.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/embedding.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/functional.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/linear.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/named_any.cpp - ${TORCH_SRC_DIR}/csrc/api/src/nn/modules/rnn.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adagrad.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/adam.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/optimizer.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/serialize.cpp - ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp - ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp - ) -endif() - - -if (TORCH_STATIC) - add_library(torch STATIC ${TORCH_SRCS}) - target_compile_definitions(torch PUBLIC TORCH_BUILD_STATIC_LIBS) -else() - add_library(torch SHARED ${TORCH_SRCS}) -endif() - -target_compile_definitions(torch PUBLIC _THP_CORE) - -# until they can be unified, keep these lists synced with setup.py -if(MSVC) - target_compile_options(torch PUBLIC - ${MSVC_RUNTIME_LIBRARY_OPTION} - /Z7 - /EHa - /DNOMINMAX - /wd4267 - /wd4251 - /wd4522 - /wd4522 - /wd4838 - /wd4305 - /wd4244 - /wd4190 - /wd4101 - /wd4996 - /wd4275 - /bigobj - ) -else() - target_compile_options(torch PUBLIC - -std=c++11 - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - # Clang has an unfixed bug leading to spurious missing braces - # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 - -Wno-missing-braces - ) - - if(NOT APPLE) - target_compile_options(torch PRIVATE - # Considered to be flaky. See the discussion at - # https://github.com/pytorch/pytorch/pull/9608 - -Wno-maybe-uninitialized) - endif() - -endif() - -if (MSVC) -elseif (WERROR) - target_compile_options(torch PRIVATE -Werror -Wno-strict-overflow) -endif() - -if (MSVC) - target_link_libraries(torch onnx onnx_library) -endif() - -target_link_libraries(torch caffe2_library) - -find_package(OpenMP QUIET) -if(USE_OPENMP AND OPENMP_FOUND) - message(STATUS "pytorch is compiling with OpenMP. \n" - "OpenMP CXX_FLAGS: ${OpenMP_CXX_FLAGS}. 
\n" - "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}.") - target_compile_options(torch INTERFACE ${OpenMP_CXX_FLAGS}) - target_link_libraries(torch ${OpenMP_CXX_LIBRARIES}) -endif() - -if (NOT NO_API) - target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) -endif() - -if(USE_CUDA) - if(MSVC) - if (NOT NVTOOLEXT_HOME) - set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") - endif() - if ($ENV{NVTOOLEXT_HOME}) - set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) - endif() - set(TORCH_CUDA_LIBRARIES - ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib - ${CUDA_LIBRARIES}) - target_include_directories(torch PUBLIC "${NVTOOLEXT_HOME}/include") - elseif(APPLE) - set(TORCH_CUDA_LIBRARIES - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib - ${CUDA_LIBRARIES}) - set_target_properties(torch PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") - else() - find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) - set(TORCH_CUDA_LIBRARIES - ${LIBNVTOOLSEXT} - ${CUDA_LIBRARIES}) - endif() - - target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) - target_compile_definitions(torch PRIVATE USE_CUDA) -endif() - -if(USE_ROCM) - target_link_libraries(torch caffe2_hip_library) - target_compile_definitions(torch PRIVATE - USE_ROCM - __HIP_PLATFORM_HCC__ - ) - target_include_directories(torch PRIVATE - /opt/rocm/include - /opt/rocm/hcc/include - /opt/rocm/rocblas/include - /opt/rocm/hipsparse/include - ) -endif() - - -set(TH_CPU_INCLUDE - # dense - ${TORCH_ROOT}/aten/src/TH - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/TH - ${TORCH_ROOT}/aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src - ${CMAKE_BINARY_DIR}/aten/src) -target_include_directories(torch PRIVATE ${TH_CPU_INCLUDE}) - -set(ATen_CPU_INCLUDE - ${TORCH_ROOT}/aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src - ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen - ${CMAKE_BINARY_DIR}/aten/src) -target_include_directories(torch PUBLIC ${ATen_CPU_INCLUDE}) - -target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc) - -target_include_directories(torch PUBLIC - ${TORCH_ROOT}/third_party/miniz-2.0.8) - -set_target_properties(torch PROPERTIES VERSION 1 SOVERSION 1) - -if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") - set_property(TARGET torch PROPERTY CXX_STANDARD 11) -endif() - -# Prevent the unused functions being optimized away -# Otherwise torch.dll will be linked without caffe2_gpu.dll if (MSVC) - set_target_properties(torch PROPERTIES LINK_FLAGS "/OPT:NOREF") -endif(MSVC) - -install(DIRECTORY "${TORCH_SRC_DIR}/csrc" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch - FILES_MATCHING PATTERN "*.h") -install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" - DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) - -install(TARGETS torch DESTINATION "${TORCH_INSTALL_LIB_DIR}") - -if (MSVC AND NOT TORCH_STATIC) - install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) -endif() - -if (BUILD_TEST AND NOT MSVC AND NOT USE_ROCM) - add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) -endif() - -if (BUILD_TEST AND NOT NO_API) - add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api) + set(LIBSHM_SUBDIR libshm_windows) +else() + set(LIBSHM_SUBDIR libshm) endif() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - message(STATUS "${CMAKE_CXX_COMPILER} ${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check") - 
execute_process( - COMMAND - "${CMAKE_CXX_COMPILER}" - "${CMAKE_CURRENT_LIST_DIR}/abi-check.cpp" - "-o" - "${CMAKE_BINARY_DIR}/abi-check" - RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT) - if (ABI_CHECK_COMPILE_RESULT) - message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}") - endif() - execute_process( - COMMAND "${CMAKE_BINARY_DIR}/abi-check" - RESULT_VARIABLE ABI_CHECK_RESULT - OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI) - if (ABI_CHECK_RESULT) - message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}") - endif() - message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") -endif() +set(LIBSHM_SRCDIR ${TORCH_SRC_DIR}/lib/${LIBSHM_SUBDIR}) +add_subdirectory(${LIBSHM_SRCDIR}) -# CMake config for external projects. -configure_file( - ${PROJECT_SOURCE_DIR}/cmake/TorchConfigVersion.cmake.in - ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake - @ONLY) -configure_file( - ${TORCH_ROOT}/cmake/TorchConfig.cmake.in - ${PROJECT_BINARY_DIR}/TorchConfig.cmake - @ONLY) -install(FILES - ${PROJECT_BINARY_DIR}/TorchConfigVersion.cmake - ${PROJECT_BINARY_DIR}/TorchConfig.cmake - DESTINATION share/cmake/Torch) -if (USE_DISTRIBUTED) - add_subdirectory(${TORCH_SRC_DIR}/lib/THD) - if (NOT MSVC AND NOT APPLE) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d) - endif() -endif() +# Generate files +set(TOOLS_PATH "${TORCH_ROOT}/tools") -if (BUILD_PYTHON) - if (MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/libshm_windows) - else() - add_subdirectory(${TORCH_SRC_DIR}/lib/libshm) - endif() - set(TORCH_PYTHON_SRCS +set(TORCH_PYTHON_SRCS + ${GENERATED_THNN_CXX} + ${GENERATED_CXX_PYTHON} ${TORCH_SRC_DIR}/csrc/CudaIPCTypes.cpp ${TORCH_SRC_DIR}/csrc/DataLoader.cpp ${TORCH_SRC_DIR}/csrc/Device.cpp @@ -511,16 +46,13 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/TypeInfo.cpp ${TORCH_SRC_DIR}/csrc/Generator.cpp ${TORCH_SRC_DIR}/csrc/Layout.cpp + ${TORCH_SRC_DIR}/csrc/MemoryFormat.cpp ${TORCH_SRC_DIR}/csrc/Module.cpp ${TORCH_SRC_DIR}/csrc/PtrWrapper.cpp ${TORCH_SRC_DIR}/csrc/Size.cpp ${TORCH_SRC_DIR}/csrc/Storage.cpp ${TORCH_SRC_DIR}/csrc/api/src/python/init.cpp ${TORCH_SRC_DIR}/csrc/autograd/functions/init.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_nn_functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_torch_functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/python_variable_methods.cpp ${TORCH_SRC_DIR}/csrc/autograd/init.cpp ${TORCH_SRC_DIR}/csrc/autograd/python_anomaly_mode.cpp ${TORCH_SRC_DIR}/csrc/autograd/python_cpp_function.cpp @@ -545,7 +77,6 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/jit/script/python_sugared_value.cpp ${TORCH_SRC_DIR}/csrc/jit/script/python_tree_views.cpp ${TORCH_SRC_DIR}/csrc/multiprocessing/init.cpp - ${TORCH_SRC_DIR}/csrc/nn/THNN.cpp ${TORCH_SRC_DIR}/csrc/onnx/init.cpp ${TORCH_SRC_DIR}/csrc/serialization.cpp ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp @@ -559,13 +90,14 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/utils/tensor_dtypes.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_layouts.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_list.cpp + ${TORCH_SRC_DIR}/csrc/utils/tensor_memoryformats.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_new.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_numpy.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_types.cpp ${TORCH_SRC_DIR}/csrc/utils/tuple_parser.cpp ) - set(TORCH_PYTHON_INCLUDE_DIRECTORIES +set(TORCH_PYTHON_INCLUDE_DIRECTORIES ${PYTHON_INCLUDE_DIR} ${TORCH_ROOT} @@ -588,40 +120,35 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/lib ) - if (MSVC) - list(APPEND 
TORCH_PYTHON_INCLUDE_DIRECTORIES - ${TORCH_SRC_DIR}/lib/libshm_windows) - else() - list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES - ${TORCH_SRC_DIR}/lib/libshm) - endif() - set(TORCH_PYTHON_LINK_LIBRARIES - torch +list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${LIBSHM_SRCDIR}) + +set(TORCH_PYTHON_LINK_LIBRARIES + caffe2_library shm) - set(TORCH_PYTHON_COMPILE_DEFINITIONS) +set(TORCH_PYTHON_COMPILE_DEFINITIONS) - set(TORCH_PYTHON_COMPILE_OPTIONS) +set(TORCH_PYTHON_COMPILE_OPTIONS) - set(TORCH_PYTHON_LINK_FLAGS "") +set(TORCH_PYTHON_LINK_FLAGS "") - if (MSVC) +if (MSVC) string(APPEND TORCH_PYTHON_LINK_FLAGS " /NODEFAULTLIB:LIBCMT.LIB") list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${PYTHON_LIBRARIES}) if (NOT ${CMAKE_BUILD_TYPE} MATCHES "Release") string(APPEND TORCH_PYTHON_LINK_FLAGS " /DEBUG:FULL") endif() - elseif (APPLE) +elseif (APPLE) string(APPEND TORCH_PYTHON_LINK_FLAGS " -undefined dynamic_lookup") - else() +else() list(APPEND TORCH_PYTHON_COMPILE_OPTIONS -fno-strict-aliasing -Wno-write-strings -Wno-strict-aliasing) - endif() +endif() - if (USE_CUDA) +if (USE_CUDA) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/Module.cpp ${TORCH_SRC_DIR}/csrc/cuda/Storage.cpp @@ -630,7 +157,7 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/cuda/utils.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_comm.cpp ${TORCH_SRC_DIR}/csrc/cuda/serialization.cpp - ${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp + ${GENERATED_THNN_CXX_CUDA} ) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDA) @@ -643,9 +170,11 @@ if (BUILD_PYTHON) find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${LIBNVTOOLSEXT}) endif() - endif() - if (USE_CUDNN) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES caffe2_gpu_library) +endif() + +if (USE_CUDNN) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN) # NOTE: these are at the front, in case there's another cuDNN in @@ -655,19 +184,19 @@ if (BUILD_PYTHON) # we're not careful. 
list(INSERT 0 TORCH_PYTHON_LINK_LIBRARIES ${CUDNN_LIBRARY}) list(INSERT 0 TORCH_PYTHON_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}) - endif() +endif() - if (USE_MIOPEN) +if (USE_MIOPEN) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_MIOPEN) list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${MIOPEN_INCLUDE_DIR}) list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MIOPEN_LIBRARY}) - endif() +endif() - if (USE_NUMPY) +if (USE_NUMPY) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NUMPY) - endif() +endif() - if (USE_ROCM) +if (USE_ROCM) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/Module.cpp ${TORCH_SRC_DIR}/csrc/cuda/Storage.cpp @@ -676,32 +205,35 @@ if (BUILD_PYTHON) ${TORCH_SRC_DIR}/csrc/cuda/utils.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_comm.cpp ${TORCH_SRC_DIR}/csrc/cuda/serialization.cpp - ${TORCH_SRC_DIR}/csrc/nn/THCUNN.cpp + ${GENERATED_THNN_CXX_CUDA} ) list(APPEND TORCH_PYTHON_LINK_LIBRARIES caffe2_hip_library) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_ROCM __HIP_PLATFORM_HCC__ ) - endif() +endif() - if (USE_DISTRIBUTED) +if (USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/Module.cpp) list(APPEND TORCH_PYTHON_INCLUDE_DIRECTORIES ${TORCH_SRC_DIR}/lib/THD) list(APPEND TORCH_PYTHON_LINK_LIBRARIES THD) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) if (NOT MSVC AND NOT APPLE) - list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp) - list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp) + list(APPEND TORCH_PYTHON_SRCS + ${TORCH_SRC_DIR}/csrc/distributed/c10d/comm.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp + ) list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) if (USE_CUDA OR USE_ROCM) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp) endif() endif() - endif() +endif() - if (USE_NCCL) +if (USE_NCCL) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) @@ -709,12 +241,16 @@ if (BUILD_PYTHON) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) if (USE_SYSTEM_NCCL) endif() - endif() +endif() - add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi") - # For Declarations.yaml dependency - add_dependencies(torch_python_stubs ATEN_CPU_FILES_GEN_TARGET) - add_custom_command( + + + + +add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/__init__.pyi") +# For Declarations.yaml dependency +add_dependencies(torch_python_stubs ATEN_CPU_FILES_GEN_TARGET) +add_custom_command( OUTPUT "${TORCH_SRC_DIR}/__init__.pyi" COMMAND @@ -727,21 +263,34 @@ if (BUILD_PYTHON) "${TORCH_ROOT}" ) - add_library(torch_python SHARED ${TORCH_PYTHON_SRCS}) - add_dependencies(torch_python torch_python_stubs) - target_link_libraries(torch_python ${TORCH_PYTHON_LINK_LIBRARIES}) +add_library(torch_python SHARED ${TORCH_PYTHON_SRCS}) - target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) +# Required workaround for generated sources +# See https://samthursfield.wordpress.com/2015/11/21/cmake-dependencies-between-targets-and-files-and-custom-commands/#custom-commands-in-different-directories +add_dependencies(torch_python generate-torch-sources) +set_source_files_properties( + ${GENERATED_THNN_SOURCES} + ${GENERATED_CXX_PYTHON} + PROPERTIES GENERATED TRUE + ) - target_compile_options(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS}) 
+target_compile_definitions(torch_python PUBLIC _THP_CORE) - target_include_directories(torch_python PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES}) +add_dependencies(torch_python torch_python_stubs) - if (NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") - set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) - endif() +target_link_libraries(torch_python ${TORCH_PYTHON_LINK_LIBRARIES}) - install(TARGETS torch_python DESTINATION "${TORCH_INSTALL_LIB_DIR}") +target_compile_definitions(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_DEFINITIONS}) + +target_compile_options(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS}) + +target_include_directories(torch_python PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES}) + + +if (NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") + set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) endif() + +install(TARGETS torch_python DESTINATION "${TORCH_INSTALL_LIB_DIR}") diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 8c2cff48e6c6..d7a779dd8265 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -179,7 +179,7 @@ def decorator(fn): try: import typing - from typing import Tuple, List, Dict + from typing import Tuple, List, Dict, Optional def is_tuple(ann): # For some reason Python 3.7 violates the Type[A, B].__origin__ == Type rule @@ -196,6 +196,22 @@ def is_dict(ann): return ann.__module__ == 'typing' and \ (getattr(ann, '__origin__', None) is typing.Dict or getattr(ann, '__origin__', None) is dict) + + def is_optional(ann): + # Optional[T] is just shorthand for Union[T, None], so check for both + union_optional = False + if ann.__module__ == 'typing' and \ + (getattr(ann, '__origin__', None) is typing.Union): + args = getattr(ann, '__args__', ()) + if len(args) == 2: + union_optional = (issubclass(args[1], type(None)) and not issubclass(args[0], type(None))) \ + or (issubclass(args[0], type(None)) and not issubclass(args[1], type(None))) + + optional = ann.__module__ == 'typing' and \ + (getattr(ann, '__origin__', None) is typing.Optional) + + return optional or union_optional + except ImportError: # A minimal polyfill for versions of Python that don't have typing. # Note that this means that they also don't support the fancy annotation syntax, so @@ -232,9 +248,20 @@ class DictCls(object): def __getitem__(self, types): return DictInstance(types) + class OptionalInstance(object): + __slots__ = ['__args__'] + + def __init__(self, types): + self.__args__ = types + + class OptionalCls(object): + def __getitem__(self, types): + return OptionalInstance(types) + Tuple = TupleCls() # noqa: T484 List = ListCls() # noqa: T484 Dict = DictCls() # noqa: T484 + Optional = DictCls() # noqa: T484 def is_tuple(ann): return isinstance(ann, TupleInstance) @@ -245,6 +272,9 @@ def is_list(ann): def is_dict(ann): return isinstance(ann, DictInstance) + def is_optional(ann): + return isinstance(ann, OptionalInstance) + # allows BroadcastingList instance to be subscriptable class BroadcastingListCls(object): diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index b904f09009fa..6e39e36dfcc9 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3031,6 +3031,15 @@ def callable(a, b) -> number See :func:`torch.det` """) +add_docstr_all('dequantize_linear', + r""" +dequantize_linear(int_tensor, scale, zero_point) -> Tensor + +Dequantize an int Tensor that represents the underlying quantized data +using affine quantization scheme with given scale and zero_point. +returns a float Tensor. 
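The `is_optional` helper added to `torch/_jit_internal.py` above relies on `Optional[T]` being plain shorthand for `Union[T, None]`. A simplified, standalone sketch of the same check against the real `typing` module (illustrative only, not the exact helper; `looks_optional` is a made-up name):

```python
import typing
from typing import Optional, Union

def looks_optional(ann):
    # Optional[T] is just Union[T, None], so an annotation counts as optional
    # when it is a typing.Union whose two arguments include NoneType.
    if getattr(ann, '__module__', None) != 'typing':
        return False
    if getattr(ann, '__origin__', None) is not typing.Union:
        return False
    args = getattr(ann, '__args__', ())
    return len(args) == 2 and type(None) in args

print(looks_optional(Optional[int]))     # True
print(looks_optional(Union[int, None]))  # True -- same alias as Optional[int]
print(looks_optional(Union[int, str]))   # False
```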
+""") + add_docstr_all('where', r""" where(condition, y) -> Tensor @@ -3191,3 +3200,8 @@ def callable(a, b) -> number r""" Is the :class:`torch.device` where this Tensor is. """) + +add_docstr_all('ndim', + r""" +Alias for :meth:`~Tensor.dim()` +""") diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 291a5871fbe0..8683f3434552 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2147,7 +2147,15 @@ def merge_dicts(*dicts): r""" get_num_threads() -> int -Gets the number of threads used for parallelizing CPU operations +Returns the number of threads used for parallelizing CPU operations +""") + +add_docstr(torch.get_num_interop_threads, + r""" +get_num_interop_threads() -> int + +Returns the number of threads used for inter-op parallelism on CPU +(e.g. in JIT interpreter) """) add_docstr(torch.gt, @@ -4304,6 +4312,16 @@ def merge_dicts(*dicts): must be called before running eager, JIT or autograd code. """) +add_docstr(torch.set_num_interop_threads, + r""" +set_num_interop_threads(int) + +Sets the number of threads used for interop parallelism +(e.g. in JIT interpreter) on CPU. +WARNING: Can only be called once and before any inter-op parallel work +is started (e.g. JIT execution). +""") + add_docstr(torch.sigmoid, r""" sigmoid(input, out=None) -> Tensor @@ -4660,6 +4678,56 @@ def merge_dicts(*dicts): tensor([ 1.0311, 0.7477, 1.2204, 0.9087]) """.format(**multi_dim_common)) +add_docstr(torch.std_mean, + r""" +.. function:: std_mean(input, unbiased=True) -> (Tensor, Tensor) + +Returns the standard-deviation and mean of all elements in the :attr:`input` tensor. + +If :attr:`unbiased` is ``False``, then the standard-deviation will be calculated +via the biased estimator. Otherwise, Bessel's correction will be used. + +Args: + input (Tensor): the input tensor + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[0.3364, 0.3591, 0.9462]]) + >>> torch.std_mean(a) + (tensor(0.3457), tensor(0.5472)) + +.. function:: std(input, dim, keepdim=False, unbiased=True) -> (Tensor, Tensor) + +Returns the standard-deviation and mean of each row of the :attr:`input` tensor in the +dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, +reduce over all of them. + +{keepdim_details} + +If :attr:`unbiased` is ``False``, then the standard-deviation will be calculated +via the biased estimator. Otherwise, Bessel's correction will be used. + +Args: + input (Tensor): the input tensor + {dim} + {keepdim} + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.5648, -0.5984, -1.2676, -1.4471], + [ 0.9267, 1.0612, 1.1050, -0.6014], + [ 0.0154, 1.9301, 0.0125, -1.0904], + [-1.9711, -0.7748, -1.3840, 0.5067]]) + >>> torch.std_mean(a, 1) + (tensor([0.9110, 0.8197, 1.2552, 1.0608]), tensor([-0.6871, 0.6229, 0.2169, -0.9058])) +""".format(**multi_dim_common)) + add_docstr(torch.sum, r""" .. function:: sum(input, dtype=None) -> Tensor @@ -5513,6 +5581,55 @@ def merge_dicts(*dicts): tensor([ 1.7444, 1.1363, 0.7356, 0.5112]) """.format(**multi_dim_common)) +add_docstr(torch.var_mean, + r""" +.. function:: var_mean(input, unbiased=True) -> (Tensor, Tensor) + +Returns the variance and mean of all elements in the :attr:`input` tensor. + +If :attr:`unbiased` is ``False``, then the variance will be calculated via the +biased estimator. Otherwise, Bessel's correction will be used. 
+ +Args: + input (Tensor): the input tensor + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[0.0146, 0.4258, 0.2211]]) + >>> torch.var_mean(a) + (tensor(0.0423), tensor(0.2205)) + +.. function:: var_mean(input, dim, keepdim=False, unbiased=True) -> (Tensor, Tensor) + +Returns the variance and mean of each row of the :attr:`input` tensor in the given +dimension :attr:`dim`. + +{keepdim_details} + +If :attr:`unbiased` is ``False``, then the variance will be calculated via the +biased estimator. Otherwise, Bessel's correction will be used. + +Args: + input (Tensor): the input tensor + {dim} + {keepdim} + unbiased (bool): whether to use the unbiased estimation or not + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.5650, 2.0415, -0.1024, -0.5790], + [ 0.2325, -2.6145, -1.6428, -0.3537], + [-0.2159, -1.1069, 1.2882, -1.3265], + [-0.6706, -1.5893, 0.6827, 1.6727]]) + >>> torch.var_mean(a, 1) + (tensor([2.3174, 1.6403, 1.4092, 2.0791]), tensor([-0.0512, -1.0946, -0.3403, 0.0239])) +""".format(**multi_dim_common)) + add_docstr(torch.zeros, r""" zeros(*sizes, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp index 46af266f3fde..d17bbabc1246 100644 --- a/torch/csrc/Device.cpp +++ b/torch/csrc/Device.cpp @@ -67,7 +67,7 @@ PyObject *THPDevice_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) device_index = r.toInt64(1); // -1 is allowed in ATen/C++, to mean the default device, but not in // Python. - AT_CHECK(device_index >= 0, "Device index must not be negative"); + TORCH_CHECK(device_index >= 0, "Device index must not be negative"); } at::Device device(as_device.type(), device_index); return THPDevice_New(device); diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index c0578002d29f..97679f290f6d 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -25,18 +25,6 @@ namespace torch { namespace { -const std::unordered_map attype_names = { - {"Float", at::kFloat}, - {"Double", at::kDouble}, - {"Half", at::kHalf}, - {"Byte", at::kByte}, - {"Char", at::kChar}, - {"Short", at::kShort}, - {"Int", at::kInt}, - {"Long", at::kLong}, - {"Bool", at::kBool}, -}; - std::unordered_map attype_to_py_storage_type; std::unordered_map py_storage_type_to_attype; @@ -62,12 +50,11 @@ at::Backend get_backend(bool is_cuda, bool is_sparse) { } } -at::DeprecatedTypeProperties* get_type(const std::string& name, bool is_cuda, bool is_sparse) { - if (is_sparse && name == "Half") { +at::DeprecatedTypeProperties* get_type(at::Backend backend, at::ScalarType scalarType) { + if (isSparse(backend) && scalarType == at::kHalf) { return nullptr; } - at::Backend backend = get_backend(is_cuda, is_sparse); - return &at::getNonVariableDeprecatedTypeProperties(backend, attype_names.at(name)); + return &at::getNonVariableDeprecatedTypeProperties(backend, scalarType); } PyTypeObject* getPyTypeObject(const at::Storage& storage) @@ -85,9 +72,8 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) } } // namespace -void registerStoragePyTypeObject(PyTypeObject *pytype, const std::string& name, bool is_cuda, bool is_sparse) -{ - auto attype = get_type(name, is_cuda, is_sparse); +void registerStoragePyTypeObject(PyTypeObject *pytype, at::Backend backend, at::ScalarType scalarType) { + auto attype = get_type(backend, scalarType); if (attype) { attype_to_py_storage_type[attype] = pytype; 
py_storage_type_to_attype[pytype] = attype; diff --git a/torch/csrc/DynamicTypes.h b/torch/csrc/DynamicTypes.h index 1f3c4bc342bf..93ce9f2af914 100644 --- a/torch/csrc/DynamicTypes.h +++ b/torch/csrc/DynamicTypes.h @@ -25,8 +25,7 @@ struct Type; namespace torch { // Register a PyTypeObject* with the given attributes void registerStoragePyTypeObject( - PyTypeObject *pytype, const std::string& name, - bool is_cuda, bool is_sparse); + PyTypeObject *pytype, at::Backend backend, at::ScalarType scalarType); void registerDtypeObject(THPDtype *dtype, at::ScalarType scalarType); void registerLayoutObject(THPLayout *layout, at::Backend backend); diff --git a/torch/csrc/MemoryFormat.cpp b/torch/csrc/MemoryFormat.cpp new file mode 100644 index 000000000000..54ce6aad1240 --- /dev/null +++ b/torch/csrc/MemoryFormat.cpp @@ -0,0 +1,80 @@ +#include + +#include +#include +#include + +#include + +#include +#include +#include + +PyObject *THPMemoryFormat_New(at::MemoryFormat memory_format, const std::string& name) +{ + auto type = (PyTypeObject*)&THPMemoryFormatType; + auto self = THPObjectPtr{type->tp_alloc(type, 0)}; + if (!self) throw python_error(); + auto self_ = reinterpret_cast(self.get()); + self_->memory_format = memory_format; + std::strncpy (self_->name, name.c_str(), MEMORY_FORMAT_NAME_LEN); + self_->name[MEMORY_FORMAT_NAME_LEN] = '\0'; + return self.release(); +} + +PyObject *THPMemoryFormat_repr(THPMemoryFormat *self) +{ + return THPUtils_packString(self->name); +} + +PyTypeObject THPMemoryFormatType = { + PyVarObject_HEAD_INIT(nullptr, 0) + "torch.memory_format", /* tp_name */ + sizeof(THPMemoryFormat), /* tp_basicsize */ + 0, /* tp_itemsize */ + nullptr, /* tp_dealloc */ + nullptr, /* tp_print */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + (reprfunc)THPMemoryFormat_repr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + nullptr, /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + nullptr, /* tp_methods */ + nullptr, /* tp_members */ + nullptr, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + nullptr, /* tp_new */ +}; + +void THPMemoryFormat_init(PyObject *module) +{ + if (PyType_Ready(&THPMemoryFormatType) < 0) { + throw python_error(); + } + Py_INCREF(&THPMemoryFormatType); + if (PyModule_AddObject(module, "memory_format", (PyObject *)&THPMemoryFormatType) != 0) { + throw python_error(); + } +} diff --git a/torch/csrc/MemoryFormat.h b/torch/csrc/MemoryFormat.h new file mode 100644 index 000000000000..835b8e92b38e --- /dev/null +++ b/torch/csrc/MemoryFormat.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include + +#include + +const int MEMORY_FORMAT_NAME_LEN = 64; + +struct THPMemoryFormat { + PyObject_HEAD + at::MemoryFormat memory_format; + char name[MEMORY_FORMAT_NAME_LEN + 1]; +}; + +extern PyTypeObject THPMemoryFormatType; + +inline bool THPMemoryFormat_Check(PyObject *obj) { + return Py_TYPE(obj) == &THPMemoryFormatType; +} + +PyObject * THPMemoryFormat_New(at::MemoryFormat memory_format, const std::string& 
name); + +void THPMemoryFormat_init(PyObject *module); diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index b0658e824213..3675a146b189 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +99,7 @@ static PyObject * THPModule_initExtension(PyObject *_unused, PyObject *shm_manag return nullptr; } torch::utils::initializeLayouts(); + torch::utils::initializeMemoryFormats(); torch::utils::initializeDtypes(); torch::tensors::initialize_python_bindings(); std::string path = THPUtils_unpackString(shm_manager_path); @@ -155,7 +158,24 @@ static PyObject * THPModule_setNumThreads(PyObject *module, PyObject *arg) { THPUtils_assert(THPUtils_checkLong(arg), "set_num_threads expects an int, " "but got %s", THPUtils_typename(arg)); - at::set_num_threads((int)THPUtils_unpackLong(arg)); + int nthreads = (int)THPUtils_unpackLong(arg); + THPUtils_assert(nthreads > 0, "set_num_threads expects a positive integer"); + at::set_num_threads(nthreads); + Py_RETURN_NONE; +} + +static PyObject * THPModule_getNumInteropThreads(PyObject *module) +{ + return PyLong_FromLong(at::get_num_interop_threads()); +} + +static PyObject * THPModule_setNumInteropThreads(PyObject *module, PyObject *arg) +{ + THPUtils_assert(THPUtils_checkLong(arg), "set_num_interop_threads expects an int, " + "but got %s", THPUtils_typename(arg)); + int nthreads = (int)THPUtils_unpackLong(arg); + THPUtils_assert(nthreads > 0, "set_num_interop_threads expects a positive integer"); + at::set_num_interop_threads(nthreads); Py_RETURN_NONE; } @@ -455,6 +475,8 @@ static PyMethodDef TorchMethods[] = { {"_get_backcompat_keepdim_warn", (PyCFunction)THPModule_getBackcompatKeepdimWarn, METH_NOARGS, nullptr}, {"get_num_threads", (PyCFunction)THPModule_getNumThreads, METH_NOARGS, nullptr}, {"set_num_threads", (PyCFunction)THPModule_setNumThreads, METH_O, nullptr}, + {"get_num_interop_threads", (PyCFunction)THPModule_getNumInteropThreads, METH_NOARGS, nullptr}, + {"set_num_interop_threads", (PyCFunction)THPModule_setNumInteropThreads, METH_O, nullptr}, {"_get_cudnn_enabled", (PyCFunction)THPModule_userEnabledCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_enabled", (PyCFunction)THPModule_setUserEnabledCuDNN, METH_O, nullptr}, {"_get_cudnn_benchmark", (PyCFunction)THPModule_benchmarkCuDNN, METH_NOARGS, nullptr}, @@ -589,6 +611,7 @@ PyObject* initModule() { THPDtype_init(module); THPDTypeInfo_init(module); THPLayout_init(module); + THPMemoryFormat_init(module); THPDevice_init(module); ASSERT_TRUE(THPVariable_initModule(module)); ASSERT_TRUE(THPFunction_initModule(module)); diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp index dcccbb217d34..12b97fb8ec1b 100644 --- a/torch/csrc/TypeInfo.cpp +++ b/torch/csrc/TypeInfo.cpp @@ -55,7 +55,7 @@ PyObject* THPFInfo_pynew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { torch::ParsedArgs<1> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); - AT_CHECK(r.idx < 2, "Not a type"); + TORCH_CHECK(r.idx < 2, "Not a type"); at::ScalarType scalar_type; if (r.idx == 1) { scalar_type = torch::tensors::get_default_scalar_type(); @@ -81,7 +81,7 @@ PyObject* THPIInfo_pynew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { }); torch::ParsedArgs<1> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); - AT_CHECK(r.idx == 0, "Not a type"); + TORCH_CHECK(r.idx == 0, "Not a type"); at::ScalarType scalar_type = 
r.scalartype(0); if (!at::isIntegralType(scalar_type)) { diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 371355aa9943..7f8ef4e01677 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -1,21 +1,7 @@ #pragma once -#ifdef _WIN32 -#if !defined(TORCH_BUILD_STATIC_LIBS) -#if defined(torch_EXPORTS) -#define TORCH_API __declspec(dllexport) -#else -#define TORCH_API __declspec(dllimport) -#endif -#else -#define TORCH_API -#endif -#elif defined(__GNUC__) -#if defined(torch_EXPORTS) -#define TORCH_API __attribute__((__visibility__("default"))) -#else -#define TORCH_API -#endif -#else -#define TORCH_API -#endif +#include + +// There's no difference between aten, torch and caffe2 libs any more +// TODO: clean up the naming for consistency +#define TORCH_API CAFFE2_API diff --git a/torch/csrc/api/include/torch/data/dataloader.h b/torch/csrc/api/include/torch/data/dataloader.h index 14da9edbaf9f..61b220ad4819 100644 --- a/torch/csrc/api/include/torch/data/dataloader.h +++ b/torch/csrc/api/include/torch/data/dataloader.h @@ -38,7 +38,7 @@ make_data_loader( Dataset dataset, DataLoaderOptions options = DataLoaderOptions()) { const optional size = dataset.size(); - AT_CHECK( + TORCH_CHECK( size.has_value(), "Expected the dataset to be sized in " "order to construct the Sampler"); diff --git a/torch/csrc/api/include/torch/data/dataloader/base.h b/torch/csrc/api/include/torch/data/dataloader/base.h index 1ec478bac7f1..b0ad56eb6514 100644 --- a/torch/csrc/api/include/torch/data/dataloader/base.h +++ b/torch/csrc/api/include/torch/data/dataloader/base.h @@ -55,7 +55,7 @@ class DataLoaderBase { /// standard algorithms like `std::copy(dataloader.begin(), dataloader.end(), /// output_iterator)` are supported too. Iterator begin() { - AT_CHECK( + TORCH_CHECK( shuttle_.in_flight_jobs() == 0, "Attempted to get a new DataLoader iterator " "while another iterator is not yet exhausted"); diff --git a/torch/csrc/api/include/torch/data/datasets/chunk.h b/torch/csrc/api/include/torch/data/datasets/chunk.h index 74142591bf99..c519a964d42f 100644 --- a/torch/csrc/api/include/torch/data/datasets/chunk.h +++ b/torch/csrc/api/include/torch/data/datasets/chunk.h @@ -112,7 +112,7 @@ class BatchDataBuffer { batch_example_indices.value().size() == example_count) BatchRequestType& indices = batch_example_indices.value(); for (size_t i : indices) { - AT_CHECK(i < data_size, "Index out of range"); + TORCH_CHECK(i < data_size, "Index out of range"); batch.emplace_back(std::move(data[i])); } remaining_size -= example_count; @@ -249,16 +249,16 @@ struct ChunkDatasetOptions { : preloader_count_(preloader_count), batch_size_(batch_size), cache_size_(cache_size) { - AT_CHECK( + TORCH_CHECK( preloader_count_ > 0, "Preloader count is 0. At least one preloader needs to be specified."); - AT_CHECK( + TORCH_CHECK( batch_size_ > 0, "Batch size is 0. A positive batch size needs to be specified."); - AT_CHECK( + TORCH_CHECK( cache_size_ > 0, "Cache size is 0. A positive cache size needs to be specified."); - AT_CHECK( + TORCH_CHECK( cache_size_ >= batch_size_, "Cache size is less than batch size. Cache needs to be large enough to " "hold at least one batch."); @@ -323,11 +323,11 @@ class ChunkDataset final /// is dataset agnostic and does not need overriding in different chunk /// datasets. 
BatchType get_batch(size_t batch_size) override { - AT_CHECK( + TORCH_CHECK( batch_buffer_ != nullptr, "Dataset needs to call reset() before calling get_batch()."); - AT_CHECK( + TORCH_CHECK( batch_size == options_.batch_size_, "The requested batch size does not match with the initialized batch size.\n" " The requested batch size is ", batch_size, diff --git a/torch/csrc/api/include/torch/data/iterator.h b/torch/csrc/api/include/torch/data/iterator.h index 21e2d3d3d593..2ba1a5d33dfb 100644 --- a/torch/csrc/api/include/torch/data/iterator.h +++ b/torch/csrc/api/include/torch/data/iterator.h @@ -50,7 +50,7 @@ struct ValidIterator : public IteratorImpl { void next() override { // If we didn't get the very first batch yet, get it now. lazy_initialize(); - AT_CHECK( + TORCH_CHECK( batch_.has_value(), "Attempted to increment iterator past the end"); // Increment to the next batch. batch_ = next_batch_(); @@ -62,7 +62,7 @@ struct ValidIterator : public IteratorImpl { Batch& get() override { // If we didn't get the very first batch yet, get it now. lazy_initialize(); - AT_CHECK( + TORCH_CHECK( batch_.has_value(), "Attempted to dereference iterator that was past the end"); return batch_.value(); diff --git a/torch/csrc/api/include/torch/expanding_array.h b/torch/csrc/api/include/torch/expanding_array.h index a840a881d9bb..605e6e2e3d0f 100644 --- a/torch/csrc/api/include/torch/expanding_array.h +++ b/torch/csrc/api/include/torch/expanding_array.h @@ -31,7 +31,7 @@ class ExpandingArray { /// at runtime. /*implicit*/ ExpandingArray(at::ArrayRef values) { // clang-format off - AT_CHECK( + TORCH_CHECK( values.size() == D, "Expected ", D, " values, but instead got ", values.size()); // clang-format on diff --git a/torch/csrc/api/include/torch/nn/cloneable.h b/torch/csrc/api/include/torch/nn/cloneable.h index 99037250ac1f..c7b6e5aaf677 100644 --- a/torch/csrc/api/include/torch/nn/cloneable.h +++ b/torch/csrc/api/include/torch/nn/cloneable.h @@ -41,7 +41,7 @@ class Cloneable : public virtual Module { copy->buffers_.clear(); copy->children_.clear(); copy->reset(); - AT_CHECK( + TORCH_CHECK( copy->parameters_.size() == parameters_.size(), "The cloned module does not have the same number of " "parameters as the original module after calling reset(). " @@ -52,7 +52,7 @@ class Cloneable : public virtual Module { copy->parameters_[parameter.key()].set_data( device ? data.to(*device) : data); } - AT_CHECK( + TORCH_CHECK( copy->buffers_.size() == buffers_.size(), "The cloned module does not have the same number of " "buffers as the original module after calling reset(). " @@ -62,7 +62,7 @@ class Cloneable : public virtual Module { auto data = autograd::Variable(*buffer).data().clone(); copy->buffers_[buffer.key()].set_data(device ? data.to(*device) : data); } - AT_CHECK( + TORCH_CHECK( copy->children_.size() == children_.size(), "The cloned module does not have the same number of " "child modules as the original module after calling reset(). " @@ -80,7 +80,7 @@ class Cloneable : public virtual Module { // was registered under the same name as `this`), but you never know what // crazy things `reset()` does, so `dynamic_cast` just to be safe. 
auto clone = std::dynamic_pointer_cast(other.clone(device)); - AT_CHECK( + TORCH_CHECK( clone != nullptr, "Attempted to clone submodule, but it is of a " "different type than the submodule it was to be cloned into"); diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index 7b566b494842..6e6509532bfe 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -566,8 +566,8 @@ template std::shared_ptr Module::register_module( std::string name, std::shared_ptr module) { - AT_CHECK(!name.empty(), "Submodule name must not be empty"); - AT_CHECK( + TORCH_CHECK(!name.empty(), "Submodule name must not be empty"); + TORCH_CHECK( name.find('.') == std::string::npos, "Submodule name must not contain a dot (got '", name, diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index 08deb0b526db..71b90487411e 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -383,7 +383,7 @@ struct AnyModule::Holder : public AnyModule::Placeholder { /// Calls `forward()` on the underlying module, casting each `Value` in the /// argument vector to a concrete value. Value forward(std::vector&& arguments) override { - AT_CHECK( + TORCH_CHECK( arguments.size() == sizeof...(ArgumentTypes), c10::demangle(type_info.name()), "'s forward() method expects ", @@ -466,7 +466,7 @@ AnyModule& AnyModule::operator=(std::shared_ptr module) { template AnyModule::Value AnyModule::any_forward(ArgumentTypes&&... arguments) { - AT_CHECK(!is_empty(), "Cannot call forward() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call forward() on an empty AnyModule"); std::vector values; values.reserve(sizeof...(ArgumentTypes)); torch::apply( @@ -483,13 +483,13 @@ ReturnType AnyModule::forward(ArgumentTypes&&... arguments) { template T& AnyModule::get() { - AT_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); return get_(); } template const T& AnyModule::get() const { - AT_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call get() on an empty AnyModule"); return get_(); } @@ -499,20 +499,20 @@ T AnyModule::get() const { } inline std::shared_ptr AnyModule::ptr() const { - AT_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); return content_->ptr(); } template std::shared_ptr AnyModule::ptr() const { - AT_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call ptr() on an empty AnyModule"); // Call get() but discard the value, just to do the type checking. get_(); return std::dynamic_pointer_cast(ptr()); } inline const std::type_info& AnyModule::type_info() const { - AT_CHECK(!is_empty(), "Cannot call type_info() on an empty AnyModule"); + TORCH_CHECK(!is_empty(), "Cannot call type_info() on an empty AnyModule"); return content_->type_info; } diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 66078bc56577..c492ef6f0d34 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -78,7 +78,7 @@ struct ConvOptions { /// Base class for all (dimension-specialized) convolution modules. 
template -class ConvImpl : public torch::nn::Cloneable { +class TORCH_API ConvImpl : public torch::nn::Cloneable { public: ConvImpl( int64_t input_channels, diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index 1212dcaa7c5d..ad70eaff86d3 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -40,7 +40,7 @@ class DropoutImplBase : public torch::nn::Cloneable { /// about the exact semantics of this module. class TORCH_API DropoutImpl : public detail::DropoutImplBase { public: - using detail::DropoutImplBase::DropoutImplBase; + explicit DropoutImpl(DropoutOptions options_ = DropoutOptions()); /// During training, applies a noise mask to the input tensor. /// During evaluation, applies an identity function. @@ -62,7 +62,7 @@ class TORCH_API DropoutImpl : public detail::DropoutImplBase { class TORCH_API FeatureDropoutImpl : public detail::DropoutImplBase { public: - using detail::DropoutImplBase::DropoutImplBase; + explicit FeatureDropoutImpl(DropoutOptions options_ = DropoutOptions()); /// During training, applies a noise mask to the input tensor. /// During evaluation, applies an identity function. diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index 7d78d6eec61d..e6d161e9f56b 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -53,7 +53,7 @@ struct TORCH_API RNNOptionsBase { /// Base class for all RNN implementations (intended for code sharing). template -class RNNImplBase : public torch::nn::Cloneable { +class TORCH_API RNNImplBase : public torch::nn::Cloneable { public: /// These must line up with the CUDNN mode codes: /// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index 7a4818cfe821..1f1c17e731f7 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -161,7 +161,7 @@ class SequentialImpl : public Cloneable { /// \endrst template ReturnType forward(InputTypes&&... inputs) { - AT_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); + TORCH_CHECK(!is_empty(), "Cannot call forward() on an empty Sequential"); auto iterator = modules_.begin(); auto input = iterator->any_forward(std::forward(inputs)...); @@ -263,7 +263,7 @@ class SequentialImpl : public Cloneable { static_assert( torch::detail::is_module::value, "Can only call Sequential::at with an nn::Module type"); - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].get(); } @@ -275,7 +275,7 @@ class SequentialImpl : public Cloneable { static_assert( torch::detail::is_module::value, "Can only call Sequential::at with an nn::Module type"); - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].get(); } @@ -283,7 +283,7 @@ class SequentialImpl : public Cloneable { /// underlying module at the given index. Throws an exception if the index is /// out of bounds. 
std::shared_ptr ptr(size_t index) const { - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].ptr(); } @@ -295,7 +295,7 @@ class SequentialImpl : public Cloneable { static_assert( torch::detail::is_module::value, "Can only call Sequential::ptr with an nn::Module type"); - AT_CHECK(index < size(), "Index out of range"); + TORCH_CHECK(index < size(), "Index out of range"); return modules_[index].ptr(); } diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h index be777be0ad86..bc1c59053758 100644 --- a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h +++ b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h @@ -73,10 +73,10 @@ std::vector parallel_apply( std::vector& modules, const std::vector& inputs, const optional>& devices = nullopt) { - AT_CHECK( + TORCH_CHECK( modules.size() == inputs.size(), "Must have as many inputs as modules"); if (devices) { - AT_CHECK( + TORCH_CHECK( modules.size() == devices->size(), "Must have as many devices as modules"); } @@ -140,7 +140,7 @@ Tensor data_parallel( int64_t dim = 0) { if (!devices) { const auto device_count = torch::cuda::device_count(); - AT_CHECK( + TORCH_CHECK( device_count > 0, "Expected at least one CUDA device to be available"); devices = std::vector(); devices->reserve(device_count); diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index e3cda201975b..1a033b7cf293 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -98,19 +98,19 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { /// Returns a shared pointer to the underlying module. const std::shared_ptr& ptr() const { - AT_CHECK(!is_empty(), "Accessing empty ModuleHolder"); + TORCH_CHECK(!is_empty(), "Accessing empty ModuleHolder"); return impl_; } /// Returns a pointer to the underlying module. Contained* get() { - AT_CHECK(!is_empty(), "Accessing empty ModuleHolder"); + TORCH_CHECK(!is_empty(), "Accessing empty ModuleHolder"); return impl_.get(); } /// Returns a const pointer to the underlying module. 
const Contained* get() const { - AT_CHECK(!is_empty(), "Accessing empty ModuleHolder"); + TORCH_CHECK(!is_empty(), "Accessing empty ModuleHolder"); return impl_.get(); } diff --git a/torch/csrc/api/include/torch/ordered_dict.h b/torch/csrc/api/include/torch/ordered_dict.h index 65b9958774e5..a26de4321548 100644 --- a/torch/csrc/api/include/torch/ordered_dict.h +++ b/torch/csrc/api/include/torch/ordered_dict.h @@ -295,41 +295,41 @@ typename OrderedDict::ConstIterator OrderedDict::end() template typename OrderedDict::Item& OrderedDict::front() { - AT_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); return items_.front(); } template const typename OrderedDict::Item& OrderedDict::front() const { - AT_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called front() on an empty OrderedDict"); return items_.front(); } template typename OrderedDict::Item& OrderedDict::back() { - AT_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); return items_.back(); } template const typename OrderedDict::Item& OrderedDict::back() const { - AT_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); + TORCH_CHECK(!items_.empty(), "Called back() on an empty OrderedDict"); return items_.back(); } template typename OrderedDict::Item& OrderedDict::operator[]( size_t index) { - AT_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); + TORCH_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); return items_[index]; } template const typename OrderedDict:: Item& OrderedDict::operator[](size_t index) const { - AT_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); + TORCH_CHECK(index < items_.size(), "Index ", index, " is out of bounds"); return items_[index]; } @@ -352,7 +352,7 @@ const Value& OrderedDict::operator[](const Key& key) const { template template Value& OrderedDict::insert(K&& key, V&& value) { - AT_CHECK( + TORCH_CHECK( index_.count(key) == 0, key_description_, " '", key, "' already defined"); // Copy `key` here and move it into the index. items_.emplace_back(key, std::forward(value)); diff --git a/torch/csrc/api/include/torch/torch.h b/torch/csrc/api/include/torch/torch.h index e7d190c499df..b41e6eefffdd 100644 --- a/torch/csrc/api/include/torch/torch.h +++ b/torch/csrc/api/include/torch/torch.h @@ -4,6 +4,14 @@ #ifdef TORCH_API_INCLUDE_EXTENSION_H #include -#warning \ + +#define DEPRECATE_MESSAGE \ "Including torch/torch.h for C++ extensions is deprecated. Please include torch/extension.h" + +#ifdef _MSC_VER +# pragma message ( DEPRECATE_MESSAGE ) +#else +# warning DEPRECATE_MESSAGE +#endif + #endif // defined(TORCH_API_INCLUDE_EXTENSION_H) diff --git a/torch/csrc/api/include/torch/utils.h b/torch/csrc/api/include/torch/utils.h index 617265c4dc37..70fc19a972d3 100644 --- a/torch/csrc/api/include/torch/utils.h +++ b/torch/csrc/api/include/torch/utils.h @@ -26,4 +26,10 @@ using at::get_num_threads; // Sets the number of threads to be used in parallel region. using at::set_num_threads; +// Returns the number of threads used for inter-op parallelism. +using at::get_num_interop_threads; + +// Sets the number of threads to be used for inter-op parallelism. 
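Note on the `torch/torch.h` hunk above: it replaces the GCC/Clang-only `#warning` with a construct MSVC also accepts. A small self-contained illustration of that compile-time warning pattern (file and message names here are illustrative):

```cpp
// warn_demo.h -- emit a build-time deprecation note on both MSVC and GCC/Clang.
#define DEMO_DEPRECATE_MESSAGE \
  "Including warn_demo.h is deprecated. Please include new_header.h"

#ifdef _MSC_VER
// MSVC has no #warning directive; #pragma message prints to the build log
// and does expand macros in its argument.
#pragma message(DEMO_DEPRECATE_MESSAGE)
#else
// GCC/Clang: #warning does not macro-expand its operand, so the macro name
// itself appears in the diagnostic, which is still a visible hint.
#warning DEMO_DEPRECATE_MESSAGE
#endif
```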
+using at::set_num_interop_threads; + } // namespace torch diff --git a/torch/csrc/api/src/data/datasets/mnist.cpp b/torch/csrc/api/src/data/datasets/mnist.cpp index d77b4573ca53..7dfe17a2a089 100644 --- a/torch/csrc/api/src/data/datasets/mnist.cpp +++ b/torch/csrc/api/src/data/datasets/mnist.cpp @@ -45,7 +45,7 @@ uint32_t read_int32(std::ifstream& stream) { uint32_t expect_int32(std::ifstream& stream, uint32_t expected) { const auto value = read_int32(stream); // clang-format off - AT_CHECK(value == expected, + TORCH_CHECK(value == expected, "Expected to read number ", expected, " but found ", value, " instead"); // clang-format on return value; @@ -63,7 +63,7 @@ Tensor read_images(const std::string& root, bool train) { const auto path = join_paths(root, train ? kTrainImagesFilename : kTestImagesFilename); std::ifstream images(path, std::ios::binary); - AT_CHECK(images, "Error opening images file at ", path); + TORCH_CHECK(images, "Error opening images file at ", path); const auto count = train ? kTrainSize : kTestSize; @@ -83,7 +83,7 @@ Tensor read_targets(const std::string& root, bool train) { const auto path = join_paths(root, train ? kTrainTargetsFilename : kTestTargetsFilename); std::ifstream targets(path, std::ios::binary); - AT_CHECK(targets, "Error opening targets file at ", path); + TORCH_CHECK(targets, "Error opening targets file at ", path); const auto count = train ? kTrainSize : kTestSize; diff --git a/torch/csrc/api/src/nn/init.cpp b/torch/csrc/api/src/nn/init.cpp index 7d64b9f02da8..c16f2b2b9ab3 100644 --- a/torch/csrc/api/src/nn/init.cpp +++ b/torch/csrc/api/src/nn/init.cpp @@ -18,7 +18,7 @@ namespace { struct Fan { explicit Fan(Tensor& tensor) { const auto dimensions = tensor.ndimension(); - AT_CHECK( + TORCH_CHECK( dimensions >= 2, "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"); @@ -73,7 +73,7 @@ Tensor constant_(Tensor tensor, Scalar value) { Tensor dirac_(Tensor tensor) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( tensor.ndimension() >= 3 && tensor.ndimension() <= 5, "Only tensors with 3, 4, or 5 dimensions are supported"); @@ -100,7 +100,7 @@ Tensor dirac_(Tensor tensor) { Tensor eye_(Tensor matrix) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( matrix.ndimension() == 2, "Only tensors with 2 dimensions are supported"); return torch::eye_out(matrix, matrix.size(0), matrix.size(1)); } @@ -118,7 +118,7 @@ Tensor ones_(Tensor tensor) { Tensor orthogonal_(Tensor tensor, double gain) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( tensor.ndimension() >= 2, "Only tensors with 2 or more dimensions are supported"); @@ -151,7 +151,7 @@ Tensor orthogonal_(Tensor tensor, double gain) { Tensor sparse_(Tensor tensor, double sparsity, double std) { NoGradGuard guard; - AT_CHECK( + TORCH_CHECK( tensor.ndimension() == 2, "Only tensors with 2 dimensions are supported"); const auto rows = tensor.size(0); diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index c0f54efb3fc4..e266656fdfd5 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -310,8 +310,8 @@ Tensor& Module::register_parameter( std::string name, Tensor tensor, bool requires_grad) { - AT_CHECK(!name.empty(), "Parameter name must not be empty"); - AT_CHECK( + TORCH_CHECK(!name.empty(), "Parameter name must not be empty"); + TORCH_CHECK( name.find('.') == std::string::npos, "Parameter name must not contain a dot (got '", name, @@ -321,8 +321,8 @@ Tensor& Module::register_parameter( } Tensor& 
Module::register_buffer(std::string name, Tensor tensor) { - AT_CHECK(!name.empty(), "Buffer name must not be empty"); - AT_CHECK( + TORCH_CHECK(!name.empty(), "Buffer name must not be empty"); + TORCH_CHECK( name.find('.') == std::string::npos, "Buffer name must not contain a dot (got '", name, @@ -388,7 +388,7 @@ std::ostream& operator<<(std::ostream& stream, const nn::Module& module) { serialize::OutputArchive& operator<<( serialize::OutputArchive& archive, const std::shared_ptr& module) { - AT_CHECK(module != nullptr, "Cannot serialize empty module"); + TORCH_CHECK(module != nullptr, "Cannot serialize empty module"); module->save(archive); return archive; } @@ -396,7 +396,7 @@ serialize::OutputArchive& operator<<( serialize::InputArchive& operator>>( serialize::InputArchive& archive, const std::shared_ptr& module) { - AT_CHECK(module != nullptr, "Cannot deserialize empty module"); + TORCH_CHECK(module != nullptr, "Cannot deserialize empty module"); module->load(archive); return archive; } diff --git a/torch/csrc/api/src/nn/modules/batchnorm.cpp b/torch/csrc/api/src/nn/modules/batchnorm.cpp index 8a542e5cefc0..7fab1f5f645a 100644 --- a/torch/csrc/api/src/nn/modules/batchnorm.cpp +++ b/torch/csrc/api/src/nn/modules/batchnorm.cpp @@ -42,7 +42,7 @@ void BatchNormImpl::pretty_print(std::ostream& stream) const { } Tensor BatchNormImpl::forward(const Tensor& input) { - AT_CHECK( + TORCH_CHECK( options.stateful_, "Calling BatchNorm::forward is only permitted when " "the 'stateful' option is true (was false). " @@ -56,7 +56,7 @@ Tensor BatchNormImpl::pure_forward( const Tensor& variance) { if (is_training()) { const auto num_channels = input.dim() > 1 ? input.size(1) : 1; - AT_CHECK( + TORCH_CHECK( input.numel() / num_channels > 1, "BatchNorm expected more than 1 value per channel when training!"); } diff --git a/torch/csrc/api/src/nn/modules/conv.cpp b/torch/csrc/api/src/nn/modules/conv.cpp index 42147438b963..739b7ccd7d21 100644 --- a/torch/csrc/api/src/nn/modules/conv.cpp +++ b/torch/csrc/api/src/nn/modules/conv.cpp @@ -22,7 +22,7 @@ template void ConvImpl::reset() { if (!options.transposed_) { for (auto pad : *options.output_padding_) { - AT_CHECK( + TORCH_CHECK( pad == 0, "Only transposed convolutions support output padding!"); } } diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp index c068f70d389c..84a0d916b7e1 100644 --- a/torch/csrc/api/src/nn/modules/dropout.cpp +++ b/torch/csrc/api/src/nn/modules/dropout.cpp @@ -14,8 +14,8 @@ namespace detail { template DropoutImplBase::DropoutImplBase(DropoutOptions options_) : options(options_) { - AT_CHECK(options.rate_ >= 0, "Dropout rate must not be less than zero"); - AT_CHECK(options.rate_ <= 1, "Dropout rate must not be greater than one"); + TORCH_CHECK(options.rate_ >= 0, "Dropout rate must not be less than zero"); + TORCH_CHECK(options.rate_ <= 1, "Dropout rate must not be greater than one"); } template @@ -27,6 +27,8 @@ template class DropoutImplBase; DropoutOptions::DropoutOptions(double rate) : rate_(rate) {} +DropoutImpl::DropoutImpl(DropoutOptions options_) : DropoutImplBase(options_) {} + Tensor DropoutImpl::forward(const Tensor& input) { return torch::dropout(input, options.rate_, this->is_training()); } @@ -35,6 +37,9 @@ void DropoutImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Dropout(rate=" << options.rate_ << ")"; } +FeatureDropoutImpl::FeatureDropoutImpl(DropoutOptions options_) + : DropoutImplBase(options_) {} + Tensor FeatureDropoutImpl::forward(const 
Tensor& input) { return torch::feature_dropout(input, options.rate_, this->is_training()); } diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp index e86217912fe8..15d9c2648bb4 100644 --- a/torch/csrc/api/src/serialize/input-archive.cpp +++ b/torch/csrc/api/src/serialize/input-archive.cpp @@ -30,7 +30,7 @@ bool InputArchive::try_read( // clang-format off auto read_param = is_buffer ? buffer : param; auto read_tensor = read_param->value().toTensor(); - AT_CHECK( + TORCH_CHECK( bool(buffer) == is_buffer, "Expected deserialized tensor for key '", key, "' to ", is_buffer ? "not " : "", "be a buffer, but it was not"); @@ -52,7 +52,7 @@ void InputArchive::read( const std::string& key, Tensor& tensor, bool is_buffer) { - AT_CHECK( + TORCH_CHECK( try_read(key, tensor, is_buffer), "No such serialized tensor '", key, @@ -69,7 +69,7 @@ bool InputArchive::try_read(const std::string& key, InputArchive& archive) { } void InputArchive::read(const std::string& key, InputArchive& archive) { - AT_CHECK( + TORCH_CHECK( try_read(key, archive), "No such serialized submodule: '", key, "'"); } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 512cfb60cb4b..b28a1f1e9366 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -214,7 +214,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// Returns true if the particular output edge is active, and that particular /// output of this function should be computed. bool should_compute_output(size_t output_edge_index) const { - AT_CHECK(output_edge_index < num_outputs(), "Index out of range"); + TORCH_CHECK(output_edge_index < num_outputs(), "Index out of range"); return next_edges_[output_edge_index].is_valid(); } diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp index 8df447cc0f8a..d762b558df65 100644 --- a/torch/csrc/autograd/functions/comm.cpp +++ b/torch/csrc/autograd/functions/comm.cpp @@ -74,7 +74,7 @@ Gather::~Gather() {} variable_list Gather::apply(variable_list&& inputs) { bool all_are_zero_dim = true; for (const auto& input : inputs) { - AT_CHECK( + TORCH_CHECK( input.is_cuda(), "All inputs to Gather must be CUDA tensors, got ", input.type()); diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h index 1e98573b74d1..204b1075093c 100644 --- a/torch/csrc/autograd/functions/comm.h +++ b/torch/csrc/autograd/functions/comm.h @@ -15,7 +15,7 @@ namespace torch { namespace autograd { //TODO: change it to TORCH_API when we merge the libs -struct TORCH_API Scatter : public Function { +struct AT_CUDA_API Scatter : public Function { explicit Scatter( std::vector devices, const c10::optional>& chunk_sizes = c10::nullopt, @@ -34,7 +34,7 @@ struct TORCH_API Scatter : public Function { bool unsqueeze_scalars_; }; -struct TORCH_API Gather : public Function { +struct AT_CUDA_API Gather : public Function { explicit Gather(const at::Device& destination_device, int64_t dim = 0); ~Gather() override; diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 8a22777225aa..5275dfa8695e 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -27,6 +27,8 @@ std::list> all_event_lists; thread_local std::shared_ptr event_list; thread_local uint16_t thread_id; +ProfilerConfig::~ProfilerConfig() = default; + RangeEventList& getEventList() { if (!event_list) { std::lock_guard guard(all_event_lists_mutex); @@ -246,7 +248,7 
@@ RecordProfile::~RecordProfile() { } void RecordProfile::processEvents(const std::vector& events) { - AT_CHECK(out_, "could not open file"); + TORCH_CHECK(out_, "could not open file"); Event* start = nullptr; for (Event* e : events) { if(0 == strcmp(e->name(), "__start_profile")) { @@ -254,7 +256,7 @@ void RecordProfile::processEvents(const std::vector& events) { break; } } - AT_CHECK(start, "could not find start?"); + TORCH_CHECK(start, "could not find start?"); std::vector stack; out_ << "[\n"; bool first = true; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 5de1a2a39de7..78a6b419c085 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -101,9 +101,10 @@ enum class TORCH_API ProfilerState { NVTX, // only emit NVTX markers }; -struct ProfilerConfig { +struct TORCH_API ProfilerConfig { ProfilerConfig(ProfilerState state, bool report_input_shapes) : state(state), report_input_shapes(report_input_shapes) {} + ~ProfilerConfig(); ProfilerState state; bool report_input_shapes; }; diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 3357ca7a32a3..9b3ede12c38a 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -304,6 +304,13 @@ PyObject *THPVariable_get_requires_grad(THPVariable *self) END_HANDLE_TH_ERRORS } +PyObject *THPVariable_get_ndim(THPVariable *self) +{ + HANDLE_TH_ERRORS + return PyInt_FromLong(self->cdata.dim()); + END_HANDLE_TH_ERRORS +} + int THPVariable_set_requires_grad(THPVariable *self, PyObject *obj) { HANDLE_TH_ERRORS @@ -443,6 +450,7 @@ static struct PyGetSetDef THPVariable_properties[] = { {"dtype", (getter)THPVariable_dtype, nullptr, nullptr, nullptr}, {"layout", (getter)THPVariable_layout, nullptr, nullptr, nullptr}, {"device", (getter)THPVariable_device, nullptr, nullptr, nullptr}, + {"ndim", (getter)THPVariable_get_ndim, nullptr, nullptr, nullptr}, {nullptr} }; @@ -508,8 +516,8 @@ void initTensorImplConversion(PyObject* module) { m.def("_wrap_tensor_impl", [](void* ptr) { auto p = c10::intrusive_ptr:: unsafe_reclaim_from_nonowning(static_cast(ptr)); - AT_CHECK(p.defined(), "Can't wrap undefined tensor"); - AT_CHECK(!p->is_variable(), "Can wrap only non-variable tensor"); + TORCH_CHECK(p.defined(), "Can't wrap undefined tensor"); + TORCH_CHECK(!p->is_variable(), "Can wrap only non-variable tensor"); auto tensor = at::Tensor::wrap_tensor_impl(std::move(p)); return py::cast(torch::autograd::Variable( torch::autograd::make_variable(std::move(tensor), false))); diff --git a/torch/csrc/autograd/record_function.cpp b/torch/csrc/autograd/record_function.cpp index 75394b3c0cbe..9ecae6da64dc 100644 --- a/torch/csrc/autograd/record_function.cpp +++ b/torch/csrc/autograd/record_function.cpp @@ -1,6 +1,8 @@ #include #include +#include + namespace torch { namespace autograd { namespace profiler { namespace { @@ -8,6 +10,28 @@ std::vector start_callbacks; std::vector end_callbacks; size_t callback_needs_inputs = 0; thread_local RecordFunction* thread_local_func_ = nullptr; + +bool is_sampled_callbacks = false; +double sampling_prob = 1.0; +constexpr double kEps = 1e-10; +} + +void setSamplingProbability(double prob) { + if (std::abs(prob - 1.0) < kEps) { + is_sampled_callbacks = false; + } else { + TORCH_CHECK(prob > -kEps && prob < 1.0); + is_sampled_callbacks = true; + } + sampling_prob = prob; +} + +double getSamplingProbability() { + return sampling_prob; +} + +bool checkCallbacksSampled() { + return 
is_sampled_callbacks; } void pushCallback( diff --git a/torch/csrc/autograd/record_function.h b/torch/csrc/autograd/record_function.h index 7d25f55f2cab..def063f78a71 100644 --- a/torch/csrc/autograd/record_function.h +++ b/torch/csrc/autograd/record_function.h @@ -92,14 +92,25 @@ struct TORCH_API RecordFunction { TORCH_API bool hasCallbacks(); TORCH_API bool needsInputs(); +TORCH_API void setSamplingProbability(double); +TORCH_API double getSamplingProbability(); +TORCH_API bool checkCallbacksSampled(); + +inline bool checkCallbacksEnabled() { + return !checkCallbacksSampled() || + (((double) std::rand() / RAND_MAX) < getSamplingProbability()); +} + // optional argument - function's seq_no #define RECORD_FUNCTION(fn, inputs, ...) \ torch::autograd::profiler::RecordFunction guard; \ if (torch::autograd::profiler::hasCallbacks()) { \ - if (torch::autograd::profiler::needsInputs()) { \ - guard.before(fn, inputs, ##__VA_ARGS__); \ - } else { \ - guard.before(fn, ##__VA_ARGS__); \ + if (torch::autograd::profiler::checkCallbacksEnabled()) { \ + if (torch::autograd::profiler::needsInputs()) { \ + guard.before(fn, inputs, ##__VA_ARGS__); \ + } else { \ + guard.before(fn, ##__VA_ARGS__); \ + } \ } \ } diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 9fa2033c15d8..b5a6798f5aa8 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -30,7 +30,7 @@ Variable::Impl::Impl(at::Tensor data, std::unique_ptr au // set_requires_grad also checks error conditions. autograd_meta->set_requires_grad(requires_grad, this); - AT_CHECK( + TORCH_CHECK( !autograd_meta->grad_fn_ || !autograd_meta->requires_grad_, "requires_grad should be false if grad_fn is set"); if (!data_.defined()) { @@ -54,8 +54,8 @@ IntArrayRef Variable::Impl::strides() const { return data_.strides(); } -bool Variable::Impl::is_contiguous() const { - return data_.is_contiguous(); +bool Variable::Impl::is_contiguous(MemoryFormat memory_format) const { + return data_.is_contiguous(memory_format); } int64_t Variable::Impl::dim() const { @@ -170,12 +170,12 @@ void Variable::Impl::set_data(const at::Tensor &new_data) { device_opt_ = new_data.device(); type_id_ = new_data.dispatch_type().type_id(); - auto new_data_impl_copy = new_data.getIntrusivePtr()->shallow_copy_and_detach(); // Version counter is not shared when we replace a `Variable`'s underlying `Tensor` // by calling `set_data(...)`. The original version of the `Variable` is always preserved. // See NOTE [ Version Counter Sharing ] for details. 
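Circling back to the `record_function` change above: callbacks can now be sampled, and `checkCallbacksEnabled()` gates `RECORD_FUNCTION` on a coin flip against the configured probability. A standalone sketch of that gating logic (simplified; the real code keeps this state in the profiler namespace):

```cpp
#include <cmath>
#include <cstdlib>

namespace demo {

// Global sampling state, mirroring the shape of the patch above (simplified).
bool is_sampled = false;
double sampling_prob = 1.0;
constexpr double kEps = 1e-10;

void setSamplingProbability(double prob) {
  // A probability of ~1.0 disables sampling entirely (every callback fires).
  is_sampled = std::abs(prob - 1.0) >= kEps;
  sampling_prob = prob;
}

bool callbacksEnabledThisCall() {
  // When sampling is off, always run; otherwise run with probability sampling_prob.
  return !is_sampled ||
         (static_cast<double>(std::rand()) / RAND_MAX) < sampling_prob;
}

} // namespace demo

int main() {
  demo::setSamplingProbability(0.01);  // profile roughly 1% of calls
  if (demo::callbacksEnabledThisCall()) {
    // ... invoke the (hypothetical) start/end callbacks here ...
  }
}
```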
- auto saved_version_ = data_.unsafeGetTensorImpl()->version_counter().current_version(); - new_data_impl_copy->set_version_counter(saved_version_); + auto new_data_impl_copy = new_data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/data_.unsafeGetTensorImpl()->version_counter(), + /*allow_tensor_metadata_change=*/true); data_ = std::move(at::Tensor(new_data_impl_copy)); } @@ -188,7 +188,7 @@ Variable::DifferentiableViewImpl::DifferentiableViewImpl(Variable base, at::Tens : Variable::Impl(std::move(data), std::move(autograd_meta), false, std::move(gradient_edge)) { auto diff_view_meta = static_cast(get_autograd_meta()); diff_view_meta->base_ = std::move(base); - AT_CHECK(diff_view_meta->base_.defined(), "base is undefined"); + TORCH_CHECK(diff_view_meta->base_.defined(), "base is undefined"); if (diff_view_meta->base_.is_view()) { diff_view_meta->base_ = diff_view_meta->base_.base(); } @@ -238,7 +238,7 @@ void Variable::rebase_history(Edge gradient_edge) { auto diff_view_meta = static_cast(get_autograd_meta()); AT_ASSERT(gradient_edge.input_nr == 0); AT_ASSERT(gradient_edge.function); - AT_CHECK( + TORCH_CHECK( gradient_edge.function->num_inputs() == 1, "Functions which modify views in-place must return a single Variable"); diff_view_meta->output_nr_ = gradient_edge.input_nr; diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index ec42d56562c9..5cd7d648131e 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -137,14 +137,14 @@ struct TORCH_API Variable : public at::Tensor { // "Downcasts" a `Tensor` into a `Variable`. Only call this on tensors you // know are Variables. /*implicit*/ Variable(at::Tensor const& rhs) : at::Tensor(rhs) { - AT_CHECK( + TORCH_CHECK( is_variable() || !defined(), "Tensor that was converted to Variable was not actually a Variable"); } /*implicit*/ Variable(at::Tensor&& rhs) : at::Tensor(std::move(rhs)) { - AT_CHECK( + TORCH_CHECK( is_variable() || !defined(), "Tensor that was converted to Variable was not actually a Variable"); } @@ -355,7 +355,7 @@ struct TORCH_API Variable::AutogradMeta : public c10::AutogradMetaInterface { /// leaf variables that want to accumulate gradients, and false for all other /// variables. void set_requires_grad(bool requires_grad, at::TensorImpl* self_impl) override { - AT_CHECK( + TORCH_CHECK( !requires_grad || at::isFloatingType(at::typeMetaToScalarType(self_impl->dtype())), "Only Tensors of floating point dtype can require gradients"); requires_grad_ = requires_grad; @@ -409,7 +409,7 @@ struct TORCH_API Variable::Impl : public at::TensorImpl { int64_t numel() const override; at::IntArrayRef sizes() const override; at::IntArrayRef strides() const override; - bool is_contiguous() const override; + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Any) const override; int64_t size(int64_t d) const override; int64_t stride(int64_t d) const override; void resize_dim(int64_t ndim) override; @@ -546,21 +546,22 @@ inline Variable make_variable_view( if (data.defined()) { if (is_differentiable) { /// Differentiable view. Track history with DifferentiableViewImpl. 
- auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto diff_view_meta = c10::guts::make_unique(); return Variable(c10::make_intrusive( std::move(base), std::move(data_copy), std::move(gradient_edge), std::move(diff_view_meta))); } else { /// Non-differentiable view. Just share version counter. - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/base.version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto autograd_meta = c10::guts::make_unique(); auto var = Variable(c10::make_intrusive( std::move(data_copy), std::move(autograd_meta), false, std::move(gradient_edge))); - var.set_version_counter(base.version_counter()); return var; } } @@ -571,12 +572,13 @@ inline Variable make_variable( at::Tensor data, bool requires_grad = false, bool allow_tensor_metadata_change = true) { - AT_CHECK( + TORCH_CHECK( !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto autograd_meta = c10::guts::make_unique(); return Variable(c10::make_intrusive(data_copy, std::move(autograd_meta), requires_grad)); @@ -588,7 +590,7 @@ inline Variable make_variable_consuming( at::Tensor data, bool requires_grad = false, bool allow_tensor_metadata_change = true) { - AT_CHECK( + TORCH_CHECK( !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { @@ -604,12 +606,13 @@ inline Variable make_variable( at::Tensor data, Edge gradient_edge, bool allow_tensor_metadata_change = true) { - AT_CHECK( + TORCH_CHECK( !data.is_variable(), "Must not create a new variable from a variable, use its .data()"); if (data.defined()) { - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach(); - data_impl_copy->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); auto data_copy = at::Tensor(data_impl_copy); auto autograd_meta = c10::guts::make_unique(); return Variable(c10::make_intrusive(data_copy, std::move(autograd_meta), false, std::move(gradient_edge))); @@ -624,7 +627,7 @@ inline Variable make_variable( /// in DEBUG mode and the tensor's dynamic type is not in fact `Variable`, /// throws a `std::invalid_argument` exception. 
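The `variable.h`/`variable.cpp` hunks above stop setting the version counter after the fact and instead pass it into `shallow_copy_and_detach` (a fresh counter for new variables, the base's counter for non-differentiable views). A toy sketch of that "copy metadata, choose the counter at copy time" idea, with made-up types rather than the real `TensorImpl` API:

```cpp
#include <cstdint>
#include <memory>

// Toy stand-ins; not the real c10::VariableVersion / TensorImpl.
struct VersionCounter {
  std::shared_ptr<uint32_t> value = std::make_shared<uint32_t>(0);
};

struct ToyImpl {
  int sizes = 0;
  VersionCounter version;
  bool allow_metadata_change = true;

  // Copy the metadata but let the caller decide which version counter the
  // copy observes -- a fresh one, or one shared with an existing tensor.
  std::shared_ptr<ToyImpl> shallow_copy_and_detach(
      VersionCounter counter, bool allow_change) const {
    auto copy = std::make_shared<ToyImpl>(*this);
    copy->version = std::move(counter);          // shared or fresh, chosen here
    copy->allow_metadata_change = allow_change;
    return copy;
  }
};

int main() {
  ToyImpl base;
  // Non-differentiable view: share the base's counter so in-place writes on
  // either tensor bump a version both of them see.
  auto view = base.shallow_copy_and_detach(base.version, /*allow_change=*/true);
  // Fresh variable: start from an independent counter.
  auto fresh = base.shallow_copy_and_detach(VersionCounter{}, /*allow_change=*/true);
  (void)view; (void)fresh;
}
```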
inline Variable& as_variable_ref(at::Tensor& tensor) { - AT_CHECK( + TORCH_CHECK( tensor.is_variable(), "Attempted to cast a Tensor to a Variable, but " "the dynamic type of the value is not Variable."); @@ -632,7 +635,7 @@ inline Variable& as_variable_ref(at::Tensor& tensor) { } inline const Variable& as_variable_ref(const at::Tensor& tensor) { - AT_CHECK( + TORCH_CHECK( tensor.is_variable(), "Attempted to cast a Tensor to a Variable, but " "the dynamic type of the value is not Variable."); @@ -767,7 +770,7 @@ inline Variable::Variable(c10::intrusive_ptr self) : at::Tensor(std::move(self)) {} inline Variable::Impl* Variable::get() const { - AT_CHECK(defined(), "Called Variable::get() on an undefined Variable"); + TORCH_CHECK(defined(), "Called Variable::get() on an undefined Variable"); return static_cast(impl_.get()); } }} // namespace torch::autograd diff --git a/torch/csrc/cuda/Event.cpp b/torch/csrc/cuda/Event.cpp index 216f7acf980e..c0e3bbb68edd 100644 --- a/torch/csrc/cuda/Event.cpp +++ b/torch/csrc/cuda/Event.cpp @@ -56,10 +56,10 @@ static PyObject * THCPEvent_from_ipc_handle( at::Device device = r.device(0); std::string handle_string = r.string(1); - AT_CHECK(handle_string.size() == sizeof(cudaIpcEventHandle_t), + TORCH_CHECK(handle_string.size() == sizeof(cudaIpcEventHandle_t), "cudaIpcEventHandle_t expects byte-like object of size ", sizeof(cudaIpcEventHandle_t), ", but got ", handle_string.size()); - AT_CHECK(device.type() == at::kCUDA, "Event can only be created on " + TORCH_CHECK(device.type() == at::kCUDA, "Event can only be created on " "CUDA devices, but got device type ", device.type()) THPObjectPtr ptr(type->tp_alloc(type, 0)); diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 308ca606f708..25ba19651215 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -181,7 +181,7 @@ std::vector scatter( if (chunk_sizes) { const int64_t chunk_size_sum = std::accumulate(chunk_sizes->begin(), chunk_sizes->end(), int64_t{0}); - AT_CHECK( + TORCH_CHECK( chunk_size_sum == tensor.size(dim), "given chunk sizes don't sum up to the tensor's size ", "(sum(chunk_sizes) == ", chunk_size_sum, @@ -190,7 +190,7 @@ std::vector scatter( int64_t chunk_start = 0; for (size_t chunk = 0; chunk < chunk_sizes->size(); ++chunk) { const int64_t chunk_size = (*chunk_sizes)[chunk]; - AT_CHECK(chunk_size > 0, "Chunk size must be positive"); + TORCH_CHECK(chunk_size > 0, "Chunk size must be positive"); chunks.push_back(tensor.narrow(dim, chunk_start, chunk_size)); chunk_start += chunk_size; } @@ -202,7 +202,7 @@ std::vector scatter( for (size_t chunk = 0; chunk < chunks.size(); ++chunk) { const auto device_index = static_cast(devices[chunk]); if (streams && (*streams)[chunk]) { - AT_CHECK( + TORCH_CHECK( (*streams)[chunk]->device_index() == device_index, "Expected the device associated with the stream at index ", chunk, " (was ", (*streams)[chunk]->device_index(), ") ", @@ -220,19 +220,19 @@ at::Tensor gather( at::TensorList tensors, int64_t dim, c10::optional destination_index) { - AT_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); + TORCH_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); at::Tensor result; int64_t total_size = 0; auto& first = tensors.front(); const auto first_size = first.sizes(); std::vector expected_size(first_size.begin(), first_size.end()); for (const auto& tensor : tensors) { - AT_CHECK( + TORCH_CHECK( tensor.is_cuda(), "Gather expects all inputs to have CUDA type"); AT_ASSERT(tensor.ndimension() == 
static_cast(expected_size.size())); expected_size[dim] = tensor.size(dim); for (size_t dimension = 0; dimension < expected_size.size(); ++dimension) { - AT_CHECK( + TORCH_CHECK( expected_size[dimension] == tensor.size(dimension), "Gather got an input of invalid size: got ", tensor.sizes(), ", but expected ", at::IntArrayRef(expected_size)); diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 2db8306252d8..e377e971f4df 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -250,7 +250,7 @@ void broadcast( const auto stream = (streams.empty() || !streams[i]) ? at::cuda::getCurrentCUDAStream(device).stream() : streams[i]->stream(); - AT_CHECK( + TORCH_CHECK( static_cast(numel) <= static_cast(count_max), "Broadcast tensor has ", numel, @@ -275,7 +275,7 @@ void reduce( const comm_list& user_comms) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - AT_CHECK( + TORCH_CHECK( root >= 0 && static_cast(root) < inputs.size(), "invalid root"); _check_inputs(inputs, outputs, 1, 1); diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp new file mode 100644 index 000000000000..5d6659559404 --- /dev/null +++ b/torch/csrc/distributed/c10d/comm.cpp @@ -0,0 +1,82 @@ +#include + +#include + +#include +#include +#include + +namespace c10d { +namespace { + +class BroadcastWork { + public: + BroadcastWork( + const std::shared_ptr& process_group, + std::vector bucket_tensors) + : bucket_tensors_(std::move(bucket_tensors)), + flat_tensor_({torch::utils::flatten_dense_tensors(bucket_tensors_)}), + work_(process_group->broadcast(flat_tensor_)) {} + + void finish() { + work_->wait(); + + // Copy the output of the broadcast operation back. + auto output_tensors = torch::utils::unflatten_dense_tensors( + flat_tensor_.front(), bucket_tensors_); + AT_ASSERT(output_tensors.size() == bucket_tensors_.size()); + for (size_t i = 0; i < output_tensors.size(); i++) { + bucket_tensors_[i].copy_(output_tensors[i], /*non_blocking=*/true); + } + } + + protected: + // The list of tensors to broadcast. They are guaranteed to be + // placed on the same device and have the same dtype. + std::vector bucket_tensors_; + + // The vector with a single flattened tensor containing the contents + // of the tensors in bucket_tensors_. It must be stored in a vector + // because c10d::ProcessGroup::broadcast takes a vector argument. + std::vector flat_tensor_; + + // The broadcast work that is kicked off upon construction. + std::shared_ptr work_; +}; + +} // namespace + +// Broadcast many tensors to all processes in the process group. +void broadcast_coalesced( + std::shared_ptr process_group, + at::TensorList tensors, + size_t buffer_size) { + // Coalesce tensors into buckets taking into account the maximum buffer size. + // This routine is multi-device aware, so the tensors can be split across + // multiple devices and can contain a mix of CPU and CUDA tensors. + const auto buckets = + compute_bucket_assignment_by_size(tensors.vec(), {buffer_size}); + + // Returns tensor at specified index in input tensor list. + const auto lookup = [&tensors](size_t index) { return tensors[index]; }; + + // We maintain a maximum of 2 in flight broadcast operations to avoid + // allocating too much memory (in case the specified tensors are very large). 
+ std::deque in_flight; + constexpr auto max_in_flight = 2; + for (const auto& bucket : buckets) { + if (in_flight.size() >= max_in_flight) { + in_flight.front().finish(); + in_flight.pop_front(); + } + + in_flight.emplace_back(process_group, c10::fmap(bucket, lookup)); + } + + while (!in_flight.empty()) { + in_flight.front().finish(); + in_flight.pop_front(); + } +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h new file mode 100644 index 000000000000..9aec2e6bac3e --- /dev/null +++ b/torch/csrc/distributed/c10d/comm.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +#include +#include + +namespace c10d { + +// Broadcast many tensors to all processes in the process group. +void broadcast_coalesced( + std::shared_ptr process_group, + at::TensorList tensors, + size_t buffer_size); + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index a1b14fd24da3..b8ce586ded42 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -534,6 +535,21 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("bucket_size"), py::call_guard()); + module.def( + "_broadcast_coalesced", + // Define a lambda such that the pybind11 prototype can take a std::vector + // for the tensor list argument, but still pass it to the underlying + // function as a c10::ArrayRef. + [](std::shared_ptr<::c10d::ProcessGroup> process_group, + std::vector tensors, + size_t buffer_size) { + broadcast_coalesced(process_group, tensors, buffer_size); + }, + py::arg("process_group"), + py::arg("tensors"), + py::arg("buffer_size"), + py::call_guard()); + Py_RETURN_TRUE; } diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 3b9caef7df4c..d919705d1e57 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -395,17 +395,19 @@ void Reducer::prepare_for_backward( "starting a new one. ", "", "This error indicates that your module has parameters that were ", - "not used in producing its output (the return value of `forward`). ", + "not used in producing loss. ", "", - "You can enable unused parameter detection by passing the keyword " + "You can enable unused parameter detection by (1) passing the keyword " "argument `find_unused_parameters=True` to ", - "`torch.nn.parallel.DistributedDataParallel`. ", + "`torch.nn.parallel.DistributedDataParallel`; (2) making sure all ", + "`forward` function outputs participate in calculating loss. " "", - "If you already have this argument set, then the distributed data ", - "parallel module wasn't able to locate the output tensors in the ", + "If you already have done the above two steps, then the distributed ", + "data parallel module wasn't able to locate the output tensors in the ", "return value of your module's `forward` function. ", - "Please include the structure of the return value of `forward` of ", - "your module when reporting this issue (e.g. list, dict, iterable)."); + "Please include the loss function and the structure of the return ", + "value of `forward` of your module when reporting this issue (e.g. ", + "list, dict, iterable)."); } // Reset accounting. 
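The new `broadcast_coalesced` above flattens each bucket, kicks off an asynchronous broadcast, and keeps at most two broadcasts in flight so very large models do not allocate all of their flat buffers at once. A generic sketch of that bounded-pipeline pattern, using a hypothetical `Work` type instead of the real c10d classes:

```cpp
#include <cstddef>
#include <deque>
#include <functional>
#include <vector>

// Hypothetical handle for an asynchronous operation (not the real c10d::Work).
struct Work {
  std::function<void()> on_finish;
  void finish() { if (on_finish) on_finish(); }  // e.g. wait + copy results back
};

// Launch one async op per bucket, but never keep more than max_in_flight
// outstanding: finish the oldest before starting a new one, then drain the rest.
void run_bounded(const std::vector<std::function<Work()>>& launchers,
                 std::size_t max_in_flight = 2) {
  std::deque<Work> in_flight;
  for (const auto& launch : launchers) {
    if (in_flight.size() >= max_in_flight) {
      in_flight.front().finish();
      in_flight.pop_front();
    }
    in_flight.push_back(launch());
  }
  while (!in_flight.empty()) {   // drain whatever is still pending
    in_flight.front().finish();
    in_flight.pop_front();
  }
}
```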
diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 4e9382f19c3f..5bec5935ef61 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -382,12 +382,11 @@ void THPStorage_(postInit)(PyObject *module) THPStorageClass = PyObject_GetAttrString(module,(char*)TH_CONCAT_STRING_2(Real,Storage)); if (!THPStorageClass) throw python_error(); - bool is_cuda = false; + at::Backend backend = at::Backend::CPU; #ifdef THC_GENERIC_FILE - is_cuda = true; + backend = at::Backend::CUDA; #endif - const char *type_name = TH_CONCAT_STRING_2(Real,); - torch::registerStoragePyTypeObject((PyTypeObject*)THPStorageClass, type_name, is_cuda, false); + torch::registerStoragePyTypeObject((PyTypeObject*)THPStorageClass, backend, TH_CONCAT_2(at::k, Real)); } #endif diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index d10868f96070..dd77ee86186d 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -856,7 +856,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { Gradient differentiate(std::shared_ptr& graph) { Gradient grad_desc; // Take ownership of the graph - AT_CHECK( + TORCH_CHECK( graph.use_count() == 1, "differentiate will mutate and destroy the graph, so it requires " "graph.use_count() == 1, but found %d", diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index c66ec81acc84..ef01b4c90543 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -85,7 +85,7 @@ c10::optional tryInsertConstant( return c10::nullopt; } if (loc) - n->setSourceLocation(std::make_shared(*loc)); + n->setSourceRange(*loc); if (scope) n->setScope(*scope); if (result_type) { diff --git a/torch/csrc/jit/custom_operator.h b/torch/csrc/jit/custom_operator.h index 87c2f74c847a..93a334ab9e08 100644 --- a/torch/csrc/jit/custom_operator.h +++ b/torch/csrc/jit/custom_operator.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -83,7 +83,7 @@ inline void checkArgumentVector( const FunctionSchema& inferredSchema, const FunctionSchema& providedSchema) { // clang-format off - AT_CHECK( + TORCH_CHECK( inferred.size() == provided.size(), "Inferred ", inferred.size(), " ", what, "(s) for operator implementation, but the provided schema specified ", @@ -92,7 +92,7 @@ inline void checkArgumentVector( // clang-format on for (size_t i = 0; i < provided.size(); ++i) { // clang-format off - AT_CHECK( + TORCH_CHECK( provided[i].type()->isSubtypeOf(inferred[i].type()), "Inferred type for ", what, " #", i, " was ", *inferred[i].type(), ", but the provided schema specified type ", *provided[i].type(), @@ -246,12 +246,33 @@ struct TORCH_API RegisterOperators { RegisterOperators& op( const std::string& name, Implementation&& implementation, - OperatorOptions options = OperatorOptions()) { + OperatorOptions options) { + registerOperator(createOperator( name, std::forward(implementation), options)); return *this; } + + template + RegisterOperators& op( + const std::string& name, + Implementation&& implementation) { + registrars_.emplace_back(std::make_shared(name, std::forward(implementation))); + + return *this; + } + +private: + // A c10::RegisterOperators instance is not copyable, so to make + // torch::jit::RegisterOperators copyable, we use shared_ptrs. + // We need to keep the c10::RegisterOperators instances around + // because this is an RAII pattern. In the destructor, the registered + // ops get de-registered. 
+ std::vector> registrars_; }; } // namespace jit + +using RegisterOperators = c10::RegisterOperators; + } // namespace torch diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 150838162ff5..0d64e0399a63 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -38,13 +38,7 @@ namespace onnx = ::ONNX_NAMESPACE; class ScriptModuleSerializer; std::string getNodeStackTraceString(const Node* n) { - std::stringstream ss; - if (n->getSourceLocation()) { - n->getSourceLocation()->highlight(ss); - } else { - ss << ""; - } - return ss.str(); + return n->sourceRange().str(); } void validateBlock( @@ -258,10 +252,8 @@ void EncoderBase::EncodeBlock( continue; } auto p_n = graph_proto->add_node(); - if (node->getSourceLocation() && !strip_doc_) { - std::stringstream ss; - node->getSourceLocation()->highlight(ss); - p_n->set_doc_string(ss.str()); + if (!strip_doc_) { + p_n->set_doc_string(node->sourceRange().str()); } for (auto input : node->inputs()) { if (input->node()->mustBeNone() && !is_raw_export) { @@ -512,7 +504,10 @@ class ScriptModuleSerializer final { // to dump the content of a tensor void writeTensorTable(torch::ModelDef* model_def); - void writeAttributeTable(); + // Write the list of ivalues to a file as a pickle program + void writePickleArchive( + const std::string& name, + const std::vector& ivalues); void writeLibs(torch::ModelDef* model_def); void convertModule( @@ -521,10 +516,8 @@ class ScriptModuleSerializer final { const std::string& name, torch::ModuleDef* module_def); - void convertParameter( - const script::Slot& param, - torch::ParameterDef* param_def, - bool is_parameter); + IValue moduleGetState(const script::Module& module); + bool moduleHasValidGetSetState(const script::Module& module); void convertClass(const ClassTypePtr& type, torch::ModelDef* model_def); @@ -534,7 +527,9 @@ class ScriptModuleSerializer final { // all tensors that will be stored std::vector tensor_table_; - std::vector attribute_table_; + // A list of attributes (indexed by attr_def->id()) and module state (indexed + // by module_def->id()) + std::vector pickled_ivalues_; // all classes used by this module hierarchy std::vector class_table_; @@ -664,8 +659,8 @@ void ScriptModuleSerializer::convertModel( convertModule( module, "", writer_.archiveName(), model_def->mutable_main_module()); - // This may write some attributes to the tensor_table_ - writeAttributeTable(); + + writePickleArchive("attributes.pkl", pickled_ivalues_); writeTensorTable(model_def); writeLibs(model_def); @@ -677,6 +672,82 @@ void ScriptModuleSerializer::convertModel( } } +bool ScriptModuleSerializer::moduleHasValidGetSetState( + const script::Module& module) { + // Check that the schemas for __getstate__ and __setstate__ are correct + auto getstate = module.module_object()->type()->getMethod("__getstate__"); + if (getstate == nullptr) { + return false; + } + auto get_schema = + module.module_object()->type()->getMethod("__getstate__")->getSchema(); + + // Check __getstate__ + // __getstate__ is expected to be (self) -> T + AT_CHECK( + get_schema.arguments().size() == 1, + "'__getstate__' must have 'self' as its only argument, but found ", + get_schema.arguments().size(), + " arguments"); + AT_CHECK( + get_schema.returns().size() == 1, + "'__getstate__' must return 1 value, but found ", + get_schema.returns().size()); + + // Check __setstate__ if the method exists + // __setstate__ is expected to be (self, T) -> None + // TODO: use getMethod("__getstate__") once methods are not lowered + auto 
setstate = module.class_compilation_unit().find_function("__setstate__"); + if (setstate == nullptr) { + return false; + } + auto set_schema = setstate->getSchema(); + + AT_CHECK( + set_schema.arguments().size() == 2, + "'__setstate__' must have 'self' and the state as its " + "only arguments, but found ", + set_schema.arguments().size(), + " arguments"); + AT_CHECK( + set_schema.returns().size() == 1, + "'__setstate__' must return None, but found ", + set_schema.returns().size(), + " return values"); + AT_CHECK( + set_schema.returns().at(0).type()->isSubtypeOf(NoneType::get()), + "'__setstate__' must return None, but found value of type", + set_schema.returns().at(0).type()->python_str()); + + // Check that the return type of __getstate__ matches the input to + // __setstate__ + auto get_type = get_schema.returns().at(0).type(); + auto set_type = set_schema.arguments().at(1).type(); + + AT_CHECK( + set_type->isSubtypeOf(get_type), + "'__getstate__'s return type (", + get_type->python_str(), + " does not match '__setstate__'s argument type (", + set_type->python_str(), + "))"); + + return true; +} + +/// Run module.__getstate__() and return the result +IValue ScriptModuleSerializer::moduleGetState(const script::Module& module) { + auto getstate = module.find_method("__getstate__"); + AT_CHECK( + getstate != nullptr, + "Cannot call '__getstate__' method because" + " it does not exist"); + + Stack stack; + getstate->run(stack); + return stack.at(0); +} + size_t ScriptModuleSerializer::addTensor(const at::Tensor& tensor) { tensor_table_.push_back(tensor); return tensor_table_.size() - 1; @@ -728,17 +799,18 @@ void ScriptModuleSerializer::writeTensorTable(torch::ModelDef* model_def) { } } -void ScriptModuleSerializer::writeAttributeTable() { +void ScriptModuleSerializer::writePickleArchive( + const std::string& name, + const std::vector& ivalues) { Pickler pickler(&tensor_table_); pickler.start(); pickler.startTuple(); - for (const IValue& ivalue : attribute_table_) { + for (const IValue& ivalue : ivalues) { pickler.addIValue(ivalue); } pickler.endTuple(); pickler.finish(); - writer_.writeRecord( - "attributes.pkl", pickler.stack().data(), pickler.stack().size()); + writer_.writeRecord(name, pickler.stack().data(), pickler.stack().size()); } void ScriptModuleSerializer::convertModule( @@ -748,19 +820,47 @@ void ScriptModuleSerializer::convertModule( torch::ModuleDef* module_def) { module_def->set_name(name); module_def->set_optimize(module.is_optimized()); - for (const auto& elem : module.get_parameters()) { + + // If __getstate__ and __setstate__ methods are provided, use those for + // serializing instead of serializing the attributes directly + bool user_provided_serialization = moduleHasValidGetSetState(module); + if (user_provided_serialization) { + // Run the '__getstate__' method on the module and store the result + pickled_ivalues_.emplace_back(moduleGetState(module)); + module_def->set_get_state_attribute_id(pickled_ivalues_.size() - 1); + } + + // Add all the parameters + for (const auto& param : module.get_parameters()) { torch::ParameterDef* param_def = module_def->add_parameters(); - convertParameter(elem, param_def, /*is_buffer=*/false); + param_def->set_name(param.name()); + param_def->set_is_buffer(false); + if (user_provided_serialization) { + // If a __getstate__ was used, don't write the actual tensor + param_def->set_tensor_id(-1); + } else { + param_def->set_tensor_id(addTensor(param.value().toTensor())); + } } + // Add all the attributes for (const auto& attribute : 
module.get_attributes()) { // Add attribute to ModuleDef torch::AttributeDef* attribute_def = module_def->add_attributes(); attribute_def->set_name(attribute.name()); attribute_def->set_type(attribute.type()->python_str()); - attribute_table_.push_back(attribute.value()); - attribute_def->set_id(attribute_table_.size() - 1); + if (!user_provided_serialization) { + // Write the attribute's index if it's actually saved, -1 if it needs to + // come from __getstate__ + pickled_ivalues_.push_back(attribute.value()); + attribute_def->set_id(pickled_ivalues_.size() - 1); + } else { + // The module had a __setstate__, so write the attribute name/type so + // it can be correctly imported, but it has no entry in the + // pickled_ivalues_ table + attribute_def->set_id(-1); + } } std::stringstream module_name; @@ -768,7 +868,7 @@ void ScriptModuleSerializer::convertModule( module_name << prefix << "_"; module_name << name; - if (module.get_methods().size() > 0) { + if (module.class_compilation_unit().get_functions().size() > 0) { std::ostringstream methods; methods << "op_version_set = " << CURRENT_OP_VERSION_SET << "\n"; PythonPrint( @@ -794,15 +894,6 @@ void ScriptModuleSerializer::convertModule( } } -void ScriptModuleSerializer::convertParameter( - const script::Slot& param, - torch::ParameterDef* param_def, - bool is_parameter) { - param_def->set_name(param.name()); - param_def->set_is_buffer(is_parameter); - param_def->set_tensor_id(addTensor(param.value().toTensor())); -} - // Pretty printing for ONNX constexpr char indent_char = ' '; constexpr size_t indent_multiplier = 2; diff --git a/torch/csrc/jit/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/fuser/cpu/fused_kernel.cpp index c044aca46234..c9603f5db3a8 100644 --- a/torch/csrc/jit/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/fuser/cpu/fused_kernel.cpp @@ -88,7 +88,7 @@ static void runCompiler( config.openmp = false; // disable for future compiles return runCompiler(cpp_file, so_file); } - AT_CHECK(r == 0, "Failed to compile a fused CPU kernel"); + TORCH_CHECK(r == 0, "Failed to compile a fused CPU kernel"); } static const std::string disas_string = "objdump -M intel -d \"${so_file}\""; diff --git a/torch/csrc/jit/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/fuser/cuda/fused_kernel.cpp index 521cfd2f2184..4f1809b5dd7f 100644 --- a/torch/csrc/jit/fuser/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/fuser/cuda/fused_kernel.cpp @@ -39,18 +39,6 @@ namespace cuda { // INSTEAD USE, e.g. nvrtc().cuLoadModule(...) // If a function is missing add it to the list in thnvrtc. -void checkCUDAVersion(const cudaDeviceProp& prop) { - if ((prop.major >= 6 && CUDA_VERSION < 8000) || - (prop.major >= 7 && CUDA_VERSION < 9000)) { - std::stringstream err_string; - err_string - << "In CUDAFusedKernel, PyTorch compiled with insufficient CUDA version: " - << CUDA_VERSION << " for the current GPU device " << prop.name - << " with device capability " << prop.major << "." 
<< prop.minor; - throw std::runtime_error(err_string.str()); - } -} - #ifdef USE_DIRECT_NVRTC std::pair, THNVRTC*> loadNVRTC() { return std::make_pair(nullptr, torch_load_nvrtc()); diff --git a/torch/csrc/jit/fuser/executor.cpp b/torch/csrc/jit/fuser/executor.cpp index fab904338c44..49dfa40634a4 100644 --- a/torch/csrc/jit/fuser/executor.cpp +++ b/torch/csrc/jit/fuser/executor.cpp @@ -69,7 +69,7 @@ static c10::optional> canRunKernel( const KernelSpec& spec, at::TensorList args) { // Short-circuits on size mismatch - AT_CHECK( + TORCH_CHECK( args.size() == spec.inputChunks().size(), "Expected ", spec.inputChunks().size(), diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index ac44b5ad08eb..fa3ae590015d 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -58,6 +61,11 @@ std::shared_ptr lastExecutedOptimizedGraph() { return last_executed_optimized_graph.lock(); } +void ExecutionPlan::run(Stack& stack) const { + InterpreterState(code).run(stack); + last_executed_optimized_graph = graph; +} + namespace { using tensor_list = std::vector; @@ -70,31 +78,6 @@ using autograd::variable_list; const size_t autodiffSubgraphNodeThreshold = 2; const size_t autodiffSubgraphInlineThreshold = 5; -struct ExecutionPlan { - ExecutionPlan() = default; - ExecutionPlan(std::shared_ptr graph) - : code(graph), graph(std::move(graph)) {} - - void run(Stack& stack) const { - InterpreterState(code).run(stack); - last_executed_optimized_graph = graph; - } - - operator bool() const { - return static_cast(graph); - } - - ExecutionPlanState getDebugState() { - ExecutionPlanState state; - state.code = &code; - state.graph = graph.get(); - return state; - } - - Code code; - std::shared_ptr graph; -}; - struct CaptureList { CaptureList(size_t capture_size) { capture_types_.reserve(capture_size); @@ -489,28 +472,16 @@ GraphExecutor* getGradExecutor(Operation& op) { // and different requires_grad states, and handles specializations for each // situation. GraphExecutor is completely unaware of tracing or module // parameters to keep the tracing concerns separated. 
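The refactor that follows pulls the shared state into `GraphExecutorImplBase` so a profiling-based executor can be swapped in behind the same `pImpl` pointer. A condensed sketch of that "pick an implementation at construction time" shape (names trimmed; not the real interfaces, which use `shared_ptr` and richer state):

```cpp
#include <memory>

// Minimal shape of the refactor: one abstract base, two implementations,
// and a constructor that picks between them based on a runtime flag.
struct ExecutorImplBase {
  virtual void run() = 0;
  virtual ~ExecutorImplBase() = default;
};

struct DefaultImpl : ExecutorImplBase {
  void run() override { /* specialize against an argument-spec cache, then run */ }
};

struct ProfilingImpl : ExecutorImplBase {
  void run() override { /* instrument the graph, collect profiles, then run */ }
};

bool& profilingMode() {           // stand-in for a getProfilingMode()-style flag
  static bool enabled = false;
  return enabled;
}

struct Executor {
  Executor()
      : pImpl(profilingMode()
                  ? std::unique_ptr<ExecutorImplBase>(new ProfilingImpl())
                  : std::unique_ptr<ExecutorImplBase>(new DefaultImpl())) {}
  void run() { pImpl->run(); }
  std::unique_ptr<ExecutorImplBase> pImpl;
};
```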
-struct GraphExecutorImpl { - static std::shared_ptr prepareGraph(std::shared_ptr& graph) { - auto copy = graph->copy(); - EraseShapeInformation(copy); - return copy; - } - - GraphExecutorImpl(std::shared_ptr graph, bool optimize) - : graph(prepareGraph(graph)), - // until we have correct alias analysis any use of mutable operators - // disables all optimization - optimize(optimize), - num_inputs(this->graph->inputs().size()), - arg_spec_creator_(*graph), - num_outputs(this->graph->outputs().size()) { +struct GraphExecutorImpl : public GraphExecutorImplBase { + GraphExecutorImpl(const std::shared_ptr& graph, bool optimize) + : GraphExecutorImplBase(graph, optimize), arg_spec_creator_(*graph) { logging::getLogger()->addStatValue( logging::runtime_counters::GRAPH_EXECUTORS_CONSTRUCTED, 1.0); } // entry point where execution begins - void run(Stack& stack) { - AT_CHECK( + void run(Stack& stack) override { + TORCH_CHECK( stack.size() >= num_inputs, "expected ", num_inputs, @@ -529,7 +500,7 @@ struct GraphExecutorImpl { return execution_plan.run(stack); } - GraphExecutorState getDebugState() { + GraphExecutorState getDebugState() override { GraphExecutorState state; state.graph = graph.get(); if (fallback) { @@ -541,7 +512,7 @@ struct GraphExecutorImpl { return state; } - private: + protected: friend struct GraphExecutor; const ExecutionPlan& getOrCompileFallback() { @@ -608,10 +579,11 @@ struct GraphExecutorImpl { for (Node* dnode : diff_nodes) { auto diff_graph = std::move(dnode->g(attr::Subgraph)); Gradient gradient = differentiate(diff_graph); - // Run post differentiation optimizations, Autodiff will replace some + // Run post differentiation optimizations, Autodiff will replace some // parts of graph with new graph, these new graphs usually consists of // control flows and miss shape information on nodes, so we run shape - // prop and differentiable optimizations to ensure the graph is optimized + // prop and differentiable optimizations to ensure the graph is + // optimized PropagateInputShapes(gradient.f); runOptimization(gradient.f); // run non diff optimization on the forward graph @@ -720,18 +692,9 @@ struct GraphExecutorImpl { } } - // The unoptimized starting graph. This field is effectively const, but we - // can't make it so because Graph::copy() is not const (and making it const is - // not that easy at this point). - std::shared_ptr graph; + ~GraphExecutorImpl() override = default; - // If false, we'll run the graph as we get it, without any optimizations. - // Useful for debugging. - const bool optimize; - const size_t num_inputs; ArgumentSpecCreator arg_spec_creator_; - const size_t num_outputs; - // Populated only when optimize is false (and in that case plan_cache will be // unused). The compiled version of graph. ExecutionPlan fallback; @@ -739,14 +702,15 @@ struct GraphExecutorImpl { // Mapping from argument configurations to optimized versions of the graph // that are specialized to the spec. std::unordered_map plan_cache; - - // GraphExecutors can be accessed from multiple threads, so this thread needs - // to be held every time we access the fallback or plan_cache. - std::mutex compile_mutex; }; GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize) - : pImpl(new GraphExecutorImpl(std::move(graph), optimize)) {} + : pImpl( + getProfilingMode() + ? 
dynamic_cast( + new ProfilingGraphExecutorImpl(graph, optimize)) + : dynamic_cast( + new GraphExecutorImpl(graph, optimize))) {} void GraphExecutor::run(Stack& inputs) { return pImpl->run(inputs); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 6c2cb6cbc65b..de51a0189df6 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -26,7 +26,7 @@ struct GraphExecutorState { std::unordered_map execution_plans; }; -struct GraphExecutorImpl; +struct GraphExecutorImplBase; struct TORCH_API GraphExecutor { GraphExecutor() = default; GraphExecutor(std::shared_ptr graph, bool optimize = true); @@ -38,7 +38,7 @@ struct TORCH_API GraphExecutor { GraphExecutorState getDebugState(); private: - std::shared_ptr pImpl; + std::shared_ptr pImpl; }; // These passes need to run before it is valid to pass to the interpreter @@ -48,6 +48,8 @@ TORCH_API void runRequiredPasses(const std::shared_ptr& g); TORCH_API void debugSetAutodiffSubgraphInlining(bool state); TORCH_API std::shared_ptr lastExecutedOptimizedGraph(); +TORCH_API bool& getProfilingMode(); + namespace detail { GraphExecutor* getGradExecutor(Operation& op); diff --git a/torch/csrc/jit/graph_executor_impl.h b/torch/csrc/jit/graph_executor_impl.h new file mode 100644 index 000000000000..d7d8b7a71f1f --- /dev/null +++ b/torch/csrc/jit/graph_executor_impl.h @@ -0,0 +1,101 @@ +#pragma once +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +struct ExecutionPlan { + ExecutionPlan() = default; + ExecutionPlan(std::shared_ptr graph) + : code(graph), graph(std::move(graph)) {} + + void run(Stack& stack) const; + + operator bool() const { + return static_cast(graph); + } + + ExecutionPlanState getDebugState() { + ExecutionPlanState state; + state.code = &code; + state.graph = graph.get(); + return state; + } + + Code code; + std::shared_ptr graph; +}; + +// a Graph can be created via tracing, or via a language-based frontend +// GraphExecutor runs it. It can run the same graph on many different sizes +// and different requires_grad states, and handles specializations for each +// situation. GraphExecutor is completely unaware of tracing or module +// parameters to keep the tracing concerns separated. +struct GraphExecutorImplBase { + static std::shared_ptr prepareGraph( + const std::shared_ptr& graph) { + auto copy = graph->copy(); + EraseShapeInformation(copy); + return copy; + } + + GraphExecutorImplBase(const std::shared_ptr& graph, bool optimize) + : graph(prepareGraph(graph)), + // until we have correct alias analysis any use of mutable operators + // disables all optimization + optimize(optimize), + num_inputs(this->graph->inputs().size()), + num_outputs(this->graph->outputs().size()) {} + + // entry point where execution begins + virtual void run(Stack& stack) = 0; + virtual GraphExecutorState getDebugState() = 0; + virtual ~GraphExecutorImplBase() = default; + + protected: + friend struct GraphExecutor; + + // The unoptimized starting graph. This field is effectively const, but we + // can't make it so because Graph::copy() is not const (and making it const is + // not that easy at this point). + std::shared_ptr graph; + + // If false, we'll run the graph as we get it, without any optimizations. + // Useful for debugging. 
+ const bool optimize; + const size_t num_inputs; + const size_t num_outputs; + + // GraphExecutors can be accessed from multiple threads, so this thread needs + // to be held every time we access the fallback or plan_cache. + std::mutex compile_mutex; +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 8d97460f6b6f..79311a07e010 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -58,8 +58,11 @@ class ScriptModuleDeserializer final { void convertModule(const torch::ModuleDef& module_def); void loadTensorTable(torch::ModelDef* model_def); - void loadAttributeTable(); + std::vector loadPickleArchive(const std::string& name); void importCallback(const std::string& qualifier); + void moduleSetState( + const std::shared_ptr& module, + IValue state); caffe2::serialize::PyTorchStreamReader reader_; // this is a hack to make sure the script module created in C++ is the @@ -69,7 +72,8 @@ class ScriptModuleDeserializer final { std::vector moduleStack_; std::vector tensor_table_; - std::vector attribute_table_; + std::vector pickled_ivalues_; + std::unordered_set imported_libs_; std::shared_ptr main_module_; @@ -130,16 +134,18 @@ void ScriptModuleDeserializer::deserialize( // Load extra files. for (const auto& kv : extra_files) { const std::string& key = "extra/" + kv.first; - at::DataPtr meta_ptr; - size_t meta_size; - std::tie(meta_ptr, meta_size) = reader_.getRecord(key); - extra_files[kv.first] = - std::string(static_cast(meta_ptr.get()), meta_size); + if (reader_.hasFile(key)) { + at::DataPtr meta_ptr; + size_t meta_size; + std::tie(meta_ptr, meta_size) = reader_.getRecord(key); + extra_files[kv.first] = + std::string(static_cast(meta_ptr.get()), meta_size); + } } loadTensorTable(&model_def); if (model_def.proto_version() >= 2) { - loadAttributeTable(); + pickled_ivalues_ = loadPickleArchive("attributes.pkl"); } // TODO: this can be simplified when C++/Python interop lands, @@ -154,13 +160,13 @@ void ScriptModuleDeserializer::loadTensorTable(torch::ModelDef* model_def) { } } -void ScriptModuleDeserializer::loadAttributeTable() { +std::vector ScriptModuleDeserializer::loadPickleArchive(const std::string& name) { at::DataPtr attributes_ptr; size_t attributes_size; std::tie(attributes_ptr, attributes_size) = - reader_.getRecord("attributes.pkl"); + reader_.getRecord(name); Unpickler unpickler(attributes_ptr.get(), attributes_size, &tensor_table_); - attribute_table_ = unpickler.parse_ivalue_list(); + return unpickler.parse_ivalue_list(); } at::Tensor ScriptModuleDeserializer::loadTensor( @@ -255,6 +261,21 @@ void ScriptModuleDeserializer::importCallback(const std::string& qualifier) { import_callback); } +void ScriptModuleDeserializer::moduleSetState( + const std::shared_ptr& module, + IValue state) { + auto setstate = module->class_compilation_unit().find_function("__setstate__"); + + AT_CHECK( + setstate != nullptr, + "Cannot call '__setstate__' method because" + " it does not exist"); + + // TODO: once modules are first class in the interpreter and methods are not + // lowered, change this to `module->run_method("__setstate__", {state});` + setstate->run({module->module_object(), state}); +} + void ScriptModuleDeserializer::convertModule( const torch::ModuleDef& module_def) { std::shared_ptr module = moduleLookup_(moduleStack_); @@ -282,10 +303,16 @@ void ScriptModuleDeserializer::convertModule( continue; } + IValue ivalue; + if (attr_def.id() >= 0) { + // attribute has no value in the table, set it to 
None for now. After + // __getstate__, check that all the attributes that are not Optional + // can't be None + ivalue = pickled_ivalues_.at(attr_def.id()); + } + module->register_attribute( - attr_def.name(), - typeParser.parseType(attr_def.type()), - attribute_table_.at(attr_def.id())); + attr_def.name(), typeParser.parseType(attr_def.type()), ivalue); } if (module_def.has_torchscript_arena()) { at::DataPtr data; @@ -303,6 +330,26 @@ void ScriptModuleDeserializer::convertModule( tensor_table_, import_callback); } + + if (module_def.has_get_state_attribute_id()) { + moduleSetState( + module, pickled_ivalues_.at(module_def.get_state_attribute_id())); + } + + for (const auto& slot : module->get_attributes()) { + // Verify that all the non-optional attributes have been initialized + // TODO: Issue #20497 + if (slot.type()->kind() != TypeKind::OptionalType) { + AT_CHECK( + !slot.value().isNone(), + "The field '", + slot.name(), + "' was left unitialized after __setstate__, but expected a ", + "value of type '", + slot.type()->python_str(), + "'"); + } + } } } // namespace diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index b4e2c4398753..369d12c63320 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -169,6 +169,38 @@ void initJITBindings(PyObject* module) { std::tuple>>(pyQParamDict); return InsertQuantDequantNodes(g, qparam_dict); }) + .def( + "_jit_pass_insert_quantdequant_for_weight_bias", + [](std::shared_ptr& moduleObj, + const std::string& method_name, + const std::string& param_name, + py::function pyGetQParamFunc) { + // For different static params we pass different getQParamFunc via + // same interface exposed by the quantizer. + if (param_name == std::string("weight")) { + auto getQParamFunc = + py::cast( + at::Tensor)>>(pyGetQParamFunc); + InsertQuantDequantNodesForParam( + moduleObj, + method_name, + param_name, + getQParamFunc, + at::ScalarType::QInt8); + } else if (param_name == std::string("bias")) { + auto getQParamFunc = + py::cast( + float, float)>>(pyGetQParamFunc); + InsertQuantDequantNodesForParam( + moduleObj, + method_name, + param_name, + getQParamFunc, + at::ScalarType::QInt32); + } else { + TORCH_CHECK(false, "Invalid Param Name"); + } + }) .def( "_jit_pass_quantlint", [](std::shared_ptr& g) { return QuantLinting(g); }) @@ -295,6 +327,9 @@ void initJITBindings(PyObject* module) { auto stack = toStack(args); checkAliasAnnotation(g, std::move(stack), unqualified_op_name); }) + .def( + "_jit_set_profiling_mode", + [](bool profiling_flag) { getProfilingMode() = profiling_flag; }) .def( "_jit_fuser_get_fused_kernel_code", [](Graph& g, std::vector inps) { @@ -374,8 +409,8 @@ void initJITBindings(PyObject* module) { try { auto symbol = Symbol::fromQualString(qualified_name); auto operations = getAllOperatorsFor(symbol); - AT_CHECK(!operations.empty(), "No such operator ", qualified_name); - AT_CHECK( + TORCH_CHECK(!operations.empty(), "No such operator ", qualified_name); + TORCH_CHECK( operations.size() == 1, "Found ", operations.size(), diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index e052456707f1..808412883d0a 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -330,7 +331,7 @@ struct Instruction { UseList inputs; ListHandle outputs; Symbol debug_name; // used in dump to understand the generated code - std::shared_ptr debug_location; // for error reporting + c10::optional debug_location; // for error 
reporting }; int relativeJump(int from_inst, int to_inst) { @@ -377,7 +378,7 @@ struct CodeImpl { void insertNodesFromBlock(Block* block) { for (auto node : block->nodes()) { - const auto& source_location = node->getSourceLocation(); + SourceRange source_location = node->sourceRange(); switch (node->kind()) { case prim::If: { // x = if c: @@ -481,7 +482,7 @@ struct CodeImpl { size_t insertInstruction(Node* n) { auto inst = insertInstruction( n->kind(), - n->getSourceLocation(), + n->sourceRange(), n->inputs(), moveFlags(n), n->outputs()); @@ -490,7 +491,7 @@ struct CodeImpl { } size_t insertInstruction( Symbol sym, - std::shared_ptr debug_location, + const SourceRange& debug_location, ArrayRef inputs, ArrayRef move_flags, ArrayRef outputs) { @@ -520,7 +521,7 @@ struct CodeImpl { } size_t insertAssign( - std::shared_ptr debug_location, + const SourceRange& debug_location, ArrayRef inputs, ArrayRef move_flags, ArrayRef outputs) { @@ -546,7 +547,7 @@ struct CodeImpl { list.size = 0; } void listInsert(ListHandle& list, int value) { - AT_CHECK( + TORCH_CHECK( list.start + list.size == (int)int_data.size(), "another list already started"); int_data.push_back(value); @@ -557,7 +558,7 @@ struct CodeImpl { list.size = 0; } void listInsert(ListHandle& list, int value) { - AT_CHECK( + TORCH_CHECK( list.start + list.size == (int)bool_data.size(), "another list already started"); bool_data.push_back(value); @@ -700,8 +701,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // the current thread will continue running before it suspends. InterpreterState state(intrusive_from_this()); e.future->addCallback([state]() { - c10::global_work_queue().run(InterpreterContinuation( - state, Stack(), autograd::GradMode::is_enabled())); + at::launch(InterpreterContinuation(state, Stack(), + autograd::GradMode::is_enabled())); }); return true; @@ -713,14 +714,10 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { } catch (std::exception& e) { // Error from the current thread bool is_jit_exception = dynamic_cast(&e); - if (instructions[pc].debug_location) { - handleError( - instructions[pc].debug_location->wrapException( - e, "operation failed in interpreter"), - is_jit_exception); - } else { - handleError(e.what(), is_jit_exception); - } + handleError( + instructions[pc].debug_location->wrapException( + e, "operation failed in interpreter"), + is_jit_exception); return false; } } diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index cb56dd710df7..d254109ea2dd 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -200,6 +200,14 @@ void Node::printAttributes(std::ostream& out, bool ignore_subgraph = false) out << "]"; } +SourceRange Node::sourceRange() const { + if(source_range_) { + return *source_range_; + } + std::stringstream ss; + return SourceRange(ss.str()); +} + static std::ostream& indent(std::ostream& out, size_t level) { for (size_t i = 0; i < level; ++i) { out << " "; @@ -224,8 +232,10 @@ std::ostream& Node::print( if (numAttributes() > 1 && kind() != prim::DifferentiableGraph) { printAttributes(out, /*ignore_subgraph=*/true); } + groups->push_back(this); } else { + out << kind().toQualString(); if (hasAttributes()) { printAttributes(out); @@ -241,6 +251,7 @@ std::ostream& Node::print( out << ", "; out << "scope: " << scName << "\n"; } + for (size_t i = 0; i < blocks().size(); ++i) { auto b = blocks()[i]; indent(out, level + 1) << "block" << i << "(" @@ -251,6 +262,7 @@ std::ostream& Node::print( } indent(out, level + 2) << "-> (" << b->outputs() << ")\n"; } + 
return out; } @@ -539,7 +551,6 @@ Block::Block(Graph* graph_, Node* node_) output_(graph_->create(prim::Return, 0)), input_(graph_->create(prim::Param, 0)), owning_node_(node_) { - input_->next() = output_; input_->prev() = output_; output_->next() = input_; @@ -642,6 +653,16 @@ void Graph::remapTypes(const std::function& type_map) { block()->remapTypes(type_map); } +void Value::inferTypeFrom(const at::Tensor& output) { + if (output.is_mkldnn()) { + // mkldnn tensor as opaque tensor doesn't have strides, so we can + // not create a CompleteTensorType + setType(DimensionedTensorType::create(output)); + return; + } + setType(CompleteTensorType::create(output)); +} + bool Value::mustBeNone() const { return node_->mustBeNone(); } @@ -973,7 +994,7 @@ void Node::destroy() { } void Node::cloneFrom(Node* s) { - setSourceLocation(s->getSourceLocation()); + s->source_range_ = s->source_range_; if (s->scope_ && !s->scope_->isBlank()) { scope_ = s->scope_; } @@ -1107,7 +1128,9 @@ Node* Node::insertBefore(Node* n) { Node* Node::insertAfter(Node* n) { AT_ASSERT(!inBlockList() && n->inBlockList()); AT_ASSERT(n->owningBlock()); - AT_ASSERTM(n->kind() != prim::Return, "Attempting to insert a Node after the Return node or before the Param node"); + AT_ASSERTM( + n->kind() != prim::Return, + "Attempting to insert a Node after the Return node or before the Param node"); this->owning_block_ = n->owningBlock(); Node* next = n->next(); n->next() = this; diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 3031dd2e4f57..89d7d2b451b2 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -162,9 +162,7 @@ struct Value { public: Value* setType(TypePtr type); - void inferTypeFrom(const at::Tensor& output) { - setType(CompleteTensorType::create(output)); - } + TORCH_API void inferTypeFrom(const at::Tensor& output); const TypePtr& type() const { AT_ASSERT(type_ != nullptr); return type_; @@ -249,7 +247,7 @@ struct TORCH_API Node { std::vector blocks_; Graph* graph_; Block* owning_block_; - std::shared_ptr source_location_; + c10::optional source_range_; ScopePtr scope_; // Assumes FunctionSchemas are persistent, so we don't manage their lifetime. // This field is effective a cache that's populated on attribute lookups and @@ -287,13 +285,12 @@ struct TORCH_API Node { NodeKind kind() const { return kind_; } - Node* setSourceLocation(std::shared_ptr sl) { - source_location_ = std::move(sl); + Node* setSourceRange(SourceRange r) { + source_range_ = std::move(r); return this; } - std::shared_ptr getSourceLocation() const { - return source_location_; - } + SourceRange sourceRange() const; + Graph* owningGraph() { return graph_; } @@ -591,8 +588,6 @@ struct TORCH_API Node { // template variable, returning nullptr if the cast is invalid.. // // Example usage: if(auto s = n.cast