In [1]:
import os
import subprocess

# Directory containing the shared libraries of the PyTorch build to inspect.
# The default is the machine-specific location this notebook was written
# against; set the TORCH_LIB_DIR environment variable to inspect another build
# without editing the notebook.
LIB = os.environ.get(
    'TORCH_LIB_DIR',
    '/home/xgao/anaconda3/lib/python3.7/site-packages/torch/lib/',
)
# Name of the shared library inside LIB that is passed to cuobjdump.
FILE = 'libtorch_cuda.so'

def demangle(symbol):
    """Return the text ``c++filt`` prints for ``symbol``.

    ``c++filt`` is started with ``symbol`` as its only argument. Its stderr is
    folded into stdout, so anything the tool writes to either stream ends up in
    the returned string. The captured bytes are decoded and surrounding
    whitespace is stripped.
    """
    completed = subprocess.run(
        ['c++filt', symbol],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return completed.stdout.decode().strip()

def get_symbols():
    """List the mangled ``add_kernel`` symbols that take a float/float lambda.

    Runs ``cuobjdump -symbols`` on the library at ``LIB``/``FILE`` (stderr is
    merged into stdout) and scans its output line by line:

    * only lines containing ``add_kernel`` are considered;
    * each such line is stripped and a fixed number of leading characters is
      dropped, equal to the length of the literal column prefix below;
    * the remainder is kept only when its demangled form contains
      ``lambda(float, float)``.

    Returns the surviving (still mangled) strings in output order.
    """
    completed = subprocess.run(
        ['cuobjdump', '-symbols', os.path.join(LIB, FILE)],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    # Width of the leading columns that precede the name on a symbol line.
    prefix_width = len('STT_FUNC         STB_GLOBAL STO_ENTRY      ')

    symbols = []
    for line in completed.stdout.decode().split('\n'):
        if 'add_kernel' not in line:
            continue
        candidate = line.strip()[prefix_width:]
        if 'lambda(float, float)' in demangle(candidate):
            symbols.append(candidate)
    return symbols

def get_asm(symbol):
    """Return selected disassembly lines for ``symbol``.

    Runs ``cuobjdump -sass --function <symbol>`` on the library at
    ``LIB``/``FILE`` with stderr merged into stdout. Every output line that
    contains the substring ``FFMA``, ``LD`` or ``ST`` is kept, stripped of
    surrounding whitespace, and the kept lines are joined with newlines.
    """
    command = ['cuobjdump', '-sass', '--function', symbol, os.path.join(LIB, FILE)]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    # Plain substring matches; a line qualifies if any one of them occurs.
    wanted = ('FFMA', 'LD', 'ST')
    kept = []
    for line in completed.stdout.decode().split('\n'):
        if any(token in line for token in wanted):
            kept.append(line.strip())
    return '\n'.join(kept)

def run():
    """Print the torch build identifiers, then a report for each symbol.

    The header is ``torch.__version__`` and ``torch.version.git_version``
    followed by a blank line. For every symbol returned by ``get_symbols()``
    the demangled name and the filtered disassembly from ``get_asm()`` are
    printed under ``**Symbol:**`` / ``**ASM:**`` headings, each followed by a
    blank line.
    """
    import torch

    print(torch.__version__)
    # end='\n\n' emits the value plus the blank separator line in one call.
    print(torch.version.git_version, end='\n\n')

    for symbol in get_symbols():
        print('**Symbol:**')
        print(demangle(symbol), end='\n\n')
        print("**ASM:**")
        print(get_asm(symbol), end='\n\n')

In [2]:
# Produce the report: torch version and git revision, then the demangled name
# and filtered SASS lines for every symbol returned by get_symbols().
run()

1.4.0a0+78cba90
78cba90a8c5584d4d1de574647ffe9daf63c851f

**Symbol:**
void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl<at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1} const&)::{lambda(int)#4}>(int, at::native::gpu_kernel_impl<at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1} const&)::{lambda(int)#4})

**ASM:**
/*10a0*/                   LDG.E.SYS R4, [R4] ;                            /* 0x000000000404738

/*00d0*/                   LDG.E.SYS R5, [R4] ;                            /* 0x0000000004057381 */
/*00e0*/                   LDG.E.SYS R2, [R2] ;                            /* 0x0000000002027381 */
/*0120*/                   FFMA R9, R2, c[0x0][0x1a0], R5 ;                /* 0x0000680002097a23 */
/*0130*/                   STG.E.SYS [R6], R9 ;                            /* 0x0000000906007386 */

**Symbol:**
void at::native::elementwise_kernel<512, 1, at::native::gpu_kernel_impl<at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl<at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()(

/*0ed0*/                   LDC.U8 R3, c[0x0][0x3a1] ;                     /* 0x0000e840ff037b82 */
/*1010*/                   LDG.E.U16.SYS R0, [R2] ;                       /* 0x0000000002007381 */
/*1040*/                   LDG.E.S8.SYS R2, [R2] ;                        /* 0x0000000002027381 */
/*1070*/                   LDG.E.U8.SYS R0, [R2] ;                        /* 0x0000000002007381 */
/*1120*/                   LDG.E.U16.SYS R0, [R2] ;                       /* 0x0000000002007381 */
/*1150*/                   LDG.E.64.SYS R2, [R2] ;                        /* 0x0000000002027381 */
/*1180*/                   LDG.E.SYS R0, [R2] ;                           /* 0x0000000002007381 */
/*1250*/                   LDG.E.SYS R0, [R2] ;                           /* 0x0000000002007381 */
/*1270*/                   LDG.E.64.SYS R2, [R2] ;                        /* 0x0000000002027381 */
/*12a0*/                   LDG.E.SYS R0, [R2] ;                           /* 0x0000000002007381 */
/*1340*/  

/*00a0*/                   LDG.E.SYS R2, [R2] ;                            /* 0x0000000002027381 */
/*00f0*/                   FFMA R7, R7, c[0x0][0x1a0], R2 ;                /* 0x0000680007077a23 */
/*0100*/                   STG.E.SYS [R4], R7 ;                            /* 0x0000000704007386 */

**Symbol:**
void at::native::elementwise_kernel<512, 1, at::native::gpu_kernel_impl<at::native::gpu_kernel_with_scalars<at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1} const&)::{lambda(float)#2}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1})::{lambda(int)#1}>(int, at::native::gpu_

/*0ed0*/                   LDC.U8 R3, c[0x0][0x3a1] ;                     /* 0x0000e840ff037b82 */
/*1010*/                   LDG.E.U16.SYS R0, [R2] ;                       /* 0x0000000002007381 */
/*1040*/                   LDG.E.S8.SYS R2, [R2] ;                        /* 0x0000000002027381 */
/*1070*/                   LDG.E.U8.SYS R0, [R2] ;                        /* 0x0000000002007381 */
/*1120*/                   LDG.E.U16.SYS R0, [R2] ;                       /* 0x0000000002007381 */
/*1150*/                   LDG.E.64.SYS R2, [R2] ;                        /* 0x0000000002027381 */
/*1180*/                   LDG.E.SYS R0, [R2] ;                           /* 0x0000000002007381 */
/*1250*/                   LDG.E.SYS R0, [R2] ;                           /* 0x0000000002007381 */
/*1270*/                   LDG.E.64.SYS R2, [R2] ;                        /* 0x0000000002027381 */
/*12a0*/                   LDG.E.SYS R0, [R2] ;                           /* 0x0000000002007381 */
/*1340*/  

/*00a0*/                   LDG.E.SYS R2, [R2] ;                            /* 0x0000000002027381 */
/*00f0*/                   FFMA R7, R2, R7, c[0x0][0x1a0] ;                /* 0x0000680002077623 */
/*0100*/                   STG.E.SYS [R4], R7 ;                            /* 0x0000000704007386 */

**Symbol:**
void at::native::elementwise_kernel<512, 1, at::native::gpu_kernel_impl<at::native::gpu_kernel_with_scalars<at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1} const&)::{lambda(float)#1}>(at::TensorIterator&, at::native::add_kernel_cuda(at::TensorIterator&, c10::Scalar)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(float, float)#1})::{lambda(int)#1}>(int, at::native::gpu_