 # Functions

In [1]:
import contextlib, torch, random, time, types
import torch.nn as nn
from functools import wraps
import hashlib, numpy as np

In [2]:
# take in a pytorch tensor and hashes it
def sha256_cpu(x):
    h = hashlib.sha256()
    h.update(x.detach().cpu().numpy().tobytes())
    return h.hexdigest()

In [3]:
def _record(tag, a, b, out, cfg):
  """
  tag:
  a: input matrix
  b: weight matrix
  out: output matrix
  cfg: config (requires attributes records (list to store log entries), sample_rate (float))
  """
  if random.random() > cfg.sample_rate:
      return
  t0 = time.perf_counter()
  h_in  = sha256_cpu(a)
  h_out = sha256_cpu(out) if out is not None else None
  h_ms  = (time.perf_counter() - t0) * 1000 # time to perform hashing
  cfg.records.append(dict(
      tag       = tag,
      shape_a   = tuple(a.shape),
      shape_b   = tuple(b.shape) if torch.is_tensor(b) else None,
      shape_out = tuple(out.shape) if torch.is_tensor(out) else None,
      hash_in   = h_in,
      hash_out  = h_out,
      hash_ms   = h_ms,
  ))

In [4]:
def _wrap_matmul(fn, name, cfg):
  """
  Wraps a matmul function with logging capabilities.

  Args:
    fn: The original matrix multiplication function to be wrapped.
    name: A string representing the tag or identifier for the operation.
    cfg: The configuration object for logging.

  Returns:
    A wrapped version of the matrix multiplication function that logs
    execution details.
  """
  @wraps(fn)
  def wrapper(*args, **kw):
      out = fn(*args, **kw)
      _record(name, args[0], args[1], out, cfg)
      return out
  return wrapper

In [5]:
@contextlib.contextmanager # turning this function into a context manager so it can be used as "with verification(): .."
def verification(sample_rate=0.2):
    """
    Hashes every GEMM in forward and backward.
    Stores records in cfg.records (thread‑safe).

    The captured data includes:
        - Input and output shapes
        - Hashes of input and output tensors
        - Time taken for hashing
        - Operation tag (e.g., 'mm', 'linear_fw')

    Args:
        sample_rate (float, optional): The fraction of GEMM operations to
            record. Defaults to 0.2, meaning 20% of operations will be
            sampled.

    Yields:
        list: A list of records, each containing information about a
            recorded GEMM operation.
    """
    # initialize empty config
    cfg = types.SimpleNamespace(sample_rate=sample_rate, records=[])

    # ── patch tensor‑level GEMMs ─────────────────────────────────────────
    patched = []
    for name in ("mm", "matmul", "bmm"):
        orig = getattr(torch, name)
        setattr(torch, name, _wrap_matmul(orig, name, cfg))
        patched.append((torch, name, orig))

    # ── global forward & backward hooks for nn.Linear ───────────────────
    from torch.nn.modules.module import (
        register_module_forward_hook,
        register_module_full_backward_hook
    )

    def fwd_hook(mod, inp, out):
        if isinstance(mod, nn.Linear):
            _record("linear_fw", inp[0], mod.weight.t(), out, cfg)

    def bwd_hook(mod, grad_in, grad_out):
        if isinstance(mod, nn.Linear):
            _record("linear_bw", grad_out[0], mod.weight, grad_in[0], cfg)

    h_fwd = register_module_forward_hook(fwd_hook)
    h_bwd = register_module_full_backward_hook(bwd_hook)

    try:
        yield cfg.records
    finally:
        for tgt, n, orig in patched:
            setattr(tgt, n, orig)
        h_fwd.remove()
        h_bwd.remove()

# Example usage

## Helper Functions

In [6]:
sha256_cpu(torch.randn(100, 100))

'ce53dda94234345609c81163010ab4e7fe5acfcb75e74a0fc752d821dc1f35f2'

In [7]:
def my_matmul(a,b):
  return a @ b

print(my_matmul(torch.tensor([[1,2],[2,3]]),torch.tensor([[1,2],[3,4]])))

cfg = types.SimpleNamespace(sample_rate=1.0, records=[])
wrapped_matmul = _wrap_matmul(my_matmul, "my_matful",cfg=cfg)
wrapped_matmul(torch.tensor([[1,2],[2,3]]),torch.tensor([[1,2],[3,4]]))
print(cfg.records)

tensor([[ 7, 10],
        [11, 16]])
[{'tag': 'my_matful', 'shape_a': (2, 2), 'shape_b': (2, 2), 'shape_out': (2, 2), 'hash_in': 'c323a0f168d63ec3e8b94a1f60cfc9e38a42fc746b9ff89f0f0696bad59c5e57', 'hash_out': '0393e95fdbfd5ef4751204ad352102b42d726bda68ac921e84da3f9c076ca547', 'hash_ms': 0.0762330000725342}]


## Running verifiction on a simple NN

In [8]:
import torch, torch.nn as nn, torch.nn.functional as F
from tabulate import tabulate

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(128, 64)
        self.l2 = nn.Linear(64, 32)
    def forward(self, x):
        return self.l2(F.relu(self.l1(x)))

model = Tiny().cuda()
x = torch.randn(4, 128, device="cuda")

with verification(sample_rate=1.0) as recs:
    y = model(x)

headers = recs[0].keys()
table_data = [list(rec.values()) for rec in recs]

print(tabulate(table_data, headers=headers, tablefmt="grid"))

+-----------+-----------+-----------+-------------+------------------------------------------------------------------+------------------------------------------------------------------+-----------+
| tag       | shape_a   | shape_b   | shape_out   | hash_in                                                          | hash_out                                                         |   hash_ms |
| linear_fw | (4, 128)  | (128, 64) | (4, 64)     | b005ee210f751ab638f3f22aeb55448fbae5d44ad931bcd5983d5c776cb655d1 | f5727fb8558a3fa707ac6feb07b51d39f00887396079a1f7d9ed7f1d1a49e3ca |  0.233563 |
+-----------+-----------+-----------+-------------+------------------------------------------------------------------+------------------------------------------------------------------+-----------+
| linear_fw | (4, 64)   | (64, 32)  | (4, 32)     | 2a8df514433cffe4f3555e3a95f9c87a05712f12e6ad31508cf0b1e50156ada8 | 878ea4fe623fe20d66c6f3ddc863cdc17908aa298df73da08a0d1dcb68e90039 |  0.274372 |
+---------

In [9]:
def timeit(fn, *a, iters=50, **kw):
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        fn(*a, **kw)
    torch.cuda.synchronize()
    return (time.perf_counter()-t0)*1000/iters

# baseline
base_ms = timeit(model, x, iters=100)

# with POMM
with verification(sample_rate=1.0):
    pomm_ms = timeit(model, x, iters=100)

print(f"baseline {base_ms:.3f} ms | with‑hash {pomm_ms:.3f} ms | overhead {(pomm_ms-base_ms)/base_ms*100:5.1f}%")

baseline 0.318 ms | with‑hash 1.565 ms | overhead 392.0%


## Running on inference on GPT-2

In [10]:
!pip -q install transformers sentencepiece

In [11]:
import torch, time, transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

model_name = "gpt2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.eval()

BATCH   = 16
SEQ_LEN = 32

prompt_text = "The quick brown fox jumps over the lazy dog. " * 4
tokens  = tokenizer(prompt_text, return_tensors="pt")["input_ids"][0][:SEQ_LEN]
inputs  = tokens.unsqueeze(0).repeat(BATCH, 1).to(device)   # (B, L)

Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
# baseline
# base_ms = timed_forward(model, inputs, iters=20)
base_ms = timeit(lambda: model(inputs).logits, iters=20)
print(f"Baseline  (no POMM): {base_ms:6.2f} ms / batch")

# using the verification
with verification(sample_rate=1.0) as records:
    pomm_ms = timeit(lambda: model(inputs).logits, iters=20)

print(f"With POMM (sha256): {pomm_ms:6.2f} ms / batch")
print(f"Overhead: {(pomm_ms - base_ms) / base_ms * 100:5.1f}%")

headers = records[0].keys()
table_data = [list(rec.values()) for rec in records]

print(tabulate(table_data, headers=headers, tablefmt="grid"))

Baseline  (no POMM):  57.06 ms / batch




With POMM (sha256): 827.55 ms / batch
Overhead: 1350.4%
+-----------+---------------+--------------+-----------------+------------------------------------------------------------------+------------------------------------------------------------------+-----------+
| tag       | shape_a       | shape_b      | shape_out       | hash_in                                                          | hash_out                                                         |   hash_ms |
| linear_fw | (16, 32, 768) | (768, 50257) | (16, 32, 50257) | 7a34aaba4c4a7bc16f53d3bb091961bc56b6f94373e52a762295094ecf0d30df | a7371efd3cd8e08cdbdf27a0d1c051f2ccd825e2840ab37f89649cde84a95327 |  1848.74  |
+-----------+---------------+--------------+-----------------+------------------------------------------------------------------+------------------------------------------------------------------+-----------+
| linear_fw | (16, 32, 768) | (768, 50257) | (16, 32, 50257) | 7a34aaba4c4a7bc16f53d3bb091961bc56b6f94373e52

## Running training on MNIST

In [None]:
!pip install torchvision

In [14]:
import torch, torch.nn as nn, torch.optim as optim, torchvision
from torch.utils.data import DataLoader
from torchvision import transforms

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

class MNISTMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

model = MNISTMLP().to(device)
opt   = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# data
transform = transforms.Compose([transforms.ToTensor()])
train_ds  = torchvision.datasets.MNIST(root="/tmp", download=True, train=True, transform=transform)
train_dl  = DataLoader(train_ds, batch_size=64, shuffle=True)

# running one epoch of training
with verification(sample_rate=1.0) as records:
    model.train()
    for i, (x, y) in enumerate(train_dl):
        if i == 200: break
        x, y = x.to(device), y.to(device)

        opt.zero_grad()
        out = model(x)
        loss = loss_fn(out, y)
        loss.backward()
        opt.step()

        if i % 50 == 0:
            print(f"batch {i:3d} | loss {loss.item():.4f}")

print("\n=== POMM summary ===")
fw = sum(1 for r in records if r['tag'] == 'linear_fw')
bw = sum(1 for r in records if r['tag'] == 'linear_bw')
print(f"forward GEMMs logged : {fw}")
print(f"backward GEMMs logged: {bw}")
print(f"total records        : {len(records)}")
print("sample record keys   :", list(records[0].keys()))


Device: cuda
batch   0 | loss 2.3004
batch  50 | loss 0.8431
batch 100 | loss 0.4851
batch 150 | loss 0.3713

=== POMM summary ===
forward GEMMs logged : 600
backward GEMMs logged: 600
total records        : 1200
sample record keys   : ['tag', 'shape_a', 'shape_b', 'shape_out', 'hash_in', 'hash_out', 'hash_ms']


# Testing GPU Hashing

In [2]:
import cupy as cp, cudf, torch, time, random
from torch.utils import dlpack

def sha256_gpu(t):
    cupy_view = cp.from_dlpack(torch.to_dlpack(t.view(torch.uint8)))
    s = cudf.Series([cupy_view])
    return s.hash_values(method="sha256")[0]

def _record_gpu(tag, a, b, out, cfg):
    """
    Same contract as your _record(), but hashes with the GPU.
    """
    if random.random() > cfg.sample_rate:
        return
    torch.cuda.synchronize()
    t0 = time.perf_counter()

    h_in  = sha256_gpu(a)
    h_out = sha256_gpu(out) if out is not None else None

    torch.cuda.synchronize()
    h_ms  = (time.perf_counter() - t0) * 1e3  # ms spent hashing

    cfg.records.append(dict(
        tag       = tag,
        shape_a   = tuple(a.shape),
        shape_b   = tuple(b.shape) if torch.is_tensor(b) else None,
        shape_out = tuple(out.shape) if torch.is_tensor(out) else None,
        hash_in   = h_in,
        hash_out  = h_out,
        hash_ms   = h_ms,
        device    = "gpu",
    ))

In [33]:
def _wrap_matmul_gpu(fn, name, cfg):
  """
  Wraps a matmul function with logging capabilities.

  Args:
    fn: The original matrix multiplication function to be wrapped.
    name: A string representing the tag or identifier for the operation.
    cfg: The configuration object for logging.

  Returns:
    A wrapped version of the matrix multiplication function that logs
    execution details.
  """
  @wraps(fn)
  def wrapper(*args, **kw):
      out = fn(*args, **kw)
      _record_gpu(name, args[0], args[1], out, cfg)
      return out
  return wrapper

In [37]:
def my_matmul(a,b):
  a = a.cuda().type(torch.float32)
  b = b.cuda().type(torch.float32)
  return a @ b

print(my_matmul(torch.tensor([[1,2],[2,3]]),torch.tensor([[1,2],[3,4]])))

cfg = types.SimpleNamespace(sample_rate=1.0, records=[])
wrapped_matmul_gpu = _wrap_matmul_gpu(my_matmul, "my_matful",cfg=cfg)
wrapped_matmul_gpu(torch.tensor([[1,2],[2,3]]),torch.tensor([[1,2],[3,4]]))
print(cfg.records)

tensor([[ 7., 10.],
        [11., 16.]], device='cuda:0')


RuntimeError: CuPy is built against CUDA, different from the backend that backs the incoming DLPack tensor

In [38]:
import cupy as cp, cudf, torch
from torch.utils import dlpack

def sha256_gpu(t: torch.Tensor) -> str:
    # 1. ensure data is on‑GPU
    if not t.is_cuda:
        t = t.cuda(non_blocking=True)

    # 2. view as bytes & make it contiguous
    t_bytes = t.contiguous().view(torch.uint8)

    # 3. zero‑copy CuPy view via DLPack
    cupy_view = cp.from_dlpack(dlpack.to_dlpack(t_bytes))

    # 4. hash with cuDF (row‑wise hashing → one row, one digest)
    digest = cudf.DataFrame({'x': cupy_view}).hash_values(method="sha256")[0]

    return digest            # already a 64‑char hex string


def _record_gpu(tag, a, b, out, cfg):
    if random.random() > cfg.sample_rate:
        return
    torch.cuda.synchronize()
    t0 = time.perf_counter()

    h_in  = sha256_gpu(a)
    h_out = sha256_gpu(out) if out is not None else None

    torch.cuda.synchronize()
    h_ms  = (time.perf_counter() - t0) * 1e3

    cfg.records.append(dict(
        tag       = tag,
        shape_a   = tuple(a.shape),
        shape_b   = tuple(b.shape) if torch.is_tensor(b) else None,
        shape_out = tuple(out.shape) if torch.is_tensor(out) else None,
        hash_in   = h_in,
        hash_out  = h_out,
        hash_ms   = h_ms,
        device    = "GPU",
    ))

def my_matmul(a, b): return a @ b

cfg = types.SimpleNamespace(sample_rate=1.0, records=[])
mm   = _wrap_matmul_gpu(my_matmul, "matmul_gpu", cfg)

a = torch.tensor([[1,2],[2,3]], dtype=torch.int64)
b = torch.tensor([[1,2],[3,4]], dtype=torch.int64)

print(mm(a, b))
print(cfg.records[0]['hash_in'][:16], '…', cfg.records[0]['hash_ms'], 'ms')


ValueError: Data must be 1-dimensional

In [3]:
import cupy as cp, cudf, torch, time, random
from functools import wraps
from torch.utils import dlpack

# ----------  GPU SHA‑256  ----------
def sha256_gpu(t: torch.Tensor) -> str:
    """
    Return a hex SHA‑256 digest computed entirely on the GPU.
    """
    # 1. Make sure data lives on the GPU and is contiguous
    if not t.is_cuda:
        t = t.cuda(non_blocking=True)
    t_bytes = t.contiguous().view(torch.uint8).flatten()         # 1‑D

    # 2. Zero‑copy CuPy view via DLPack
    cupy_vec = cp.from_dlpack(dlpack.to_dlpack(t_bytes))

    # 3. One‑row / one‑column DataFrame → one digest
    #    (column name doesn’t matter; we pick 'b')
    df = cudf.DataFrame({'b': cupy_vec.reshape(1, -1)})
    return df.hash_values(method="sha256")[0]      # hex string


# ----------  logger wrapper  ----------
def _record_gpu(tag, a, b, out, cfg):
    if random.random() > cfg.sample_rate:
        return
    torch.cuda.synchronize()
    t0 = time.perf_counter()

    h_in  = sha256_gpu(a)
    h_out = sha256_gpu(out) if out is not None else None

    torch.cuda.synchronize()
    h_ms = (time.perf_counter() - t0) * 1e3

    cfg.records.append(dict(
        tag       = tag,
        shape_a   = tuple(a.shape),
        shape_b   = tuple(b.shape) if torch.is_tensor(b) else None,
        shape_out = tuple(out.shape) if torch.is_tensor(out) else None,
        hash_in   = h_in,
        hash_out  = h_out,
        hash_ms   = h_ms,
        device    = "GPU",
    ))

# ----------  matmul decorator  ----------
from functools import wraps, partial
import types, torch

def _wrap_matmul_gpu(fn, name, cfg):
    @wraps(fn)
    def wrapper(*args, **kw):
        out = fn(*args, **kw)
        _record_gpu(name, args[0], args[1], out, cfg)
        return out
    return wrapper

def my_matmul(a, b):         # original op
    return a @ b


cfg = types.SimpleNamespace(sample_rate=1.0, records=[])

mm = _wrap_matmul_gpu(my_matmul, "matmul_gpu", cfg)

a = torch.tensor([[1, 2],
                  [2, 3]], dtype=torch.float32, device='cuda')
b = torch.tensor([[1, 2],
                  [3, 4]], dtype=torch.float32, device='cuda')

print(mm(a, b))          # should print the product
print(cfg.records)       # digest + timing now logged


ValueError: Data must be 1-dimensional

In [5]:
# GPU SHA‑256 demo  ── hashes any message ≤ 55 bytes (fits in one 512‑bit block)
# !pip install numba --quiet            # uncomment if Numba isn't installed

import numpy as np
from numba import cuda, uint32

# 64 SHA‑256 round constants
K = np.array([
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
], dtype=np.uint32)

# -------- device helpers ----------------------------------------------------
@cuda.jit(device=True, inline=True)
def rotr(x, n):                       # rotate right
    return ((x >> n) | (x << (32 - n))) & 0xffffffff

# -------- main kernel -------------------------------------------------------
@cuda.jit
def sha256_kernel(words, digests):
    i = cuda.grid(1)                  # 1 thread → 1 message
    if i >= words.shape[0]:
        return

    # message schedule array in registers / local memory
    w = cuda.local.array(64, uint32)
    for t in range(16):                # first 16 words come from caller
        w[t] = words[i, t]

    for t in range(16, 64):            # extend to 64 words
        s0 = rotr(w[t-15], 7) ^ rotr(w[t-15], 18) ^ (w[t-15] >> 3)
        s1 = rotr(w[t-2], 17) ^ rotr(w[t-2], 19) ^ (w[t-2] >> 10)
        w[t] = (w[t-16] + s0 + w[t-7] + s1) & 0xffffffff

    # initial hash value (H0‑H7)
    a,b,c,d,e,f,g,h = (
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19)

    for t in range(64):
        S1  = rotr(e,6)  ^ rotr(e,11) ^ rotr(e,25)
        ch  = (e & f) ^ ((~e) & g)
        temp1 = (h + S1 + ch + K[t] + w[t]) & 0xffffffff
        S0  = rotr(a,2)  ^ rotr(a,13) ^ rotr(a,22)
        maj = (a & b) ^ (a & c) ^ (b & c)
        temp2 = (S0 + maj) & 0xffffffff

        h,g,f,e,d,c,b,a = g,f,e,(d+temp1)&0xffffffff, c,b,a,(temp1+temp2)&0xffffffff

    # add the compressed chunk to current hash value
    digests[i, 0] = (0x6a09e667 + a) & 0xffffffff
    digests[i, 1] = (0xbb67ae85 + b) & 0xffffffff
    digests[i, 2] = (0x3c6ef372 + c) & 0xffffffff
    digests[i, 3] = (0xa54ff53a + d) & 0xffffffff
    digests[i, 4] = (0x510e527f + e) & 0xffffffff
    digests[i, 5] = (0x9b05688c + f) & 0xffffffff
    digests[i, 6] = (0x1f83d9ab + g) & 0xffffffff
    digests[i, 7] = (0x5be0cd19 + h) & 0xffffffff

# -------- host‑side helpers --------------------------------------------------
def pad_single_block(msg: bytes) -> np.ndarray:
    """Return one 512‑bit block (16 × uint32) with SHA‑256 padding."""
    bit_len = (len(msg) * 8).to_bytes(8, 'big')
    msg = msg + b'\x80'                             # append 1‑bit
    msg += b'\x00' * ((56 - len(msg) % 64) % 64)    # pad with zeros
    msg += bit_len                                  # append length
    block = np.frombuffer(msg[:64], dtype='>u4')    # big‑endian
    return block.astype(np.uint32)                  # convert to little‑endian words

# -------- demo ----------------------------------------------------------------
messages = [b'hello', b'GPU SHA-256 demo!']  # use "-" not "‑"
blocks   = np.stack([pad_single_block(m) for m in messages])

d_words   = cuda.to_device(blocks)
d_digests = cuda.device_array((len(messages), 8), dtype=np.uint32)

threads_per_block = 128
blocks_per_grid   = (len(messages) + threads_per_block - 1) // threads_per_block
sha256_kernel[blocks_per_grid, threads_per_block](d_words, d_digests)

hashes = d_digests.copy_to_host()
for m, h in zip(messages, hashes):
    print(f"{m!r}  →  {''.join(f'{x:08x}' for x in h)}")

ERROR:numba.cuda.cudadrv.driver:Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION


LinkerError: [222] Call to cuLinkAddData results in CUDA_ERROR_UNSUPPORTED_PTX_VERSION
ptxas application ptx input, line 9; fatal   : Unsupported .version 8.5; current version is '8.4'