<a href="https://colab.research.google.com/github/tonycccccc/tony.github.io/blob/main/MemoryProfiling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notebook for memory profiling -- all the attention-layer matrices used in this notebook were generated by running the target GitHub repo. \
Author: Zeyu Chen

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# Report the attached GPU, driver version and current GPU memory usage via the NVIDIA CLI.
! nvidia-smi

Tue Dec 14 16:55:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive; the data and result paths used by later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Memory Profiling Sample Method
def MemoryCheck(self, memory_used):
  """Return the peak GPU:0 memory recorded while converting ``memory_used`` to a tensor.

  Args:
    self: Unused; kept so existing call sites that pass it keep working.
    memory_used: Any value accepted by ``tf.convert_to_tensor``.

  Returns:
    The ``'peak'`` entry of ``tf.config.experimental.get_memory_info('GPU:0')``
    measured after the conversion, or ``None`` when TensorFlow reports no GPU.
  """
  if not tf.config.list_physical_devices('GPU'):
    # Nothing to measure without a GPU device.
    return None
  # Reset the memory stats so the peak reflects only the allocation below.
  tf.config.experimental.reset_memory_stats('GPU:0')
  # Creates the first peak memory usage.
  x = tf.convert_to_tensor(memory_used)
  del x
  return tf.config.experimental.get_memory_info('GPU:0')['peak']

In [None]:
# iteration_num: how many stacked test matrices the data-generation cell builds.
# matrix_num: how many matrices each profiling loop iterates over.

iteration_num = 12
matrix_num = 12

In [None]:
import glob
import os


def _load_tensors(paths):
    """Parse each serialized tensor file in ``paths`` as float32, preserving the given order."""
    return [
        tf.io.parse_tensor(tf.io.read_file(path), out_type=tf.float32)
        for path in paths
    ]


# Load the logged attention matrices from files.
# glob.glob returns matches in arbitrary order, so every list is sorted to make the
# i-th query, key and value come from matching files on every run.
# NOTE(review): this assumes the query/key/value files share the same name suffixes — confirm.
all_query_files = sorted(glob.glob("/content/drive/MyDrive/models/transformer/logging_query*.txt"))
all_key_files = sorted(glob.glob("/content/drive/MyDrive/models/transformer/logging_key*.txt"))
all_value_files = sorted(glob.glob("/content/drive/MyDrive/models/transformer/logging_value*.txt"))
all_bias_files = sorted(glob.glob("/content/drive/MyDrive/models/transformer/logging_bias*.txt"))
query_matrix = _load_tensors(all_query_files)
key_matrix = _load_tensors(all_key_files)
value_matrix = _load_tensors(all_value_files)
# Show the shape of the first query matrix as a sanity check.
query_matrix[0].shape

TensorShape([64, 64, 16, 64])

In [None]:
# Test setup: merge every per-file matrix into one large array per role.
# NumPy is used on purpose; the original note records that tf.concat did not do the same thing.
# Each result is expected to be (64 * k) x 64 x 16 x 64 for k loaded files.
query_matrix = np.concatenate(query_matrix, axis=0)
key_matrix = np.concatenate(key_matrix, axis=0)
value_matrix = np.concatenate(value_matrix, axis=0)
for stacked in (query_matrix, key_matrix, value_matrix):
    print(stacked.shape)

(576, 64, 16, 64)
(576, 64, 16, 64)
(576, 64, 16, 64)


In [None]:
# Testing matrix mixed with large and normal matrix.
# For every block in query_matrix, emit either one randomly chosen block ("normal")
# or the concatenation of 1-9 randomly chosen blocks ("large"), each with probability 0.5.
num_blocks = len(query_matrix)
# np.random.shuffle works in place and returns None, so its result must not be assigned back.
np.random.shuffle(query_matrix)
data = []
for _ in range(num_blocks):
  # Draw a fresh probability per entry; drawing it once would send every entry down the same branch.
  if np.random.rand() > 0.5:
    # Start at 1 so there is always at least one block to concatenate.
    num = np.random.randint(1, 10)
    picked = np.random.randint(num_blocks, size=num)
    data.append(np.concatenate([query_matrix[j] for j in picked]))
  else:
    data.append(query_matrix[np.random.randint(num_blocks)])
# The entries have different leading sizes, so store them in an object array filled
# element by element (np.array(data) cannot build a regular array from ragged input).
mixed = np.empty(len(data), dtype=object)
for slot, block in enumerate(data):
  mixed[slot] = block
query_matrix = mixed

In [None]:
# Build `iteration_num` large matrices per role from randomly shuffled data.
# Each role is shuffled once up front and once more half-way through, and a
# freshly concatenated copy is recorded on every iteration.
# NOTE(review): np.concatenate joins the *elements* of each matrix along axis 0, so this
# presumably expects query/key/value to still be sequences of 4-D matrices here — confirm.
# Alternatives tried earlier in this cell: appending the matrices without concatenating,
# and concatenating the collected data (optionally with a new leading axis) instead of stacking.
data_query, data_key, data_value = [], [], []
sources = (query_matrix, key_matrix, value_matrix)
collected = (data_query, data_key, data_value)

for source in sources:
  np.random.shuffle(source)

halfway = iteration_num // 2
for step in range(iteration_num):
  if step == halfway:
    for source in sources:
      np.random.shuffle(source)
  for bucket, source in zip(collected, sources):
    bucket.append(np.concatenate(source))

query_matrix = np.array(data_query)
key_matrix = np.array(data_key)
value_matrix = np.array(data_value)

In [None]:
# Generate test matrices with shape 1 * n * 16 * 64 by sampling blocks (with replacement)
# from the first axis of query/key/value and joining them along the sequence axis.
# Set matrix_size to 2k, 4k, 8k, 16k, 32k, and 64k
matrix_size = 32 * 1024
# Read the pool size and rows-per-block from the data instead of hard-coding 576 and 64,
# so the cell keeps working when a different number of files is loaded.
num_blocks = query_matrix.shape[0]
rows_per_block = query_matrix.shape[1]
data_query = []
data_key = []
data_value = []
for _ in range(matrix_size // rows_per_block):
  # The same block index is used for query, key and value so they stay aligned.
  idx = np.random.randint(0, num_blocks)
  data_query.append(query_matrix[idx, :, :, :])
  data_key.append(key_matrix[idx, :, :, :])
  data_value.append(value_matrix[idx, :, :, :])
# Add a leading axis of size 1 so the result is 4-D.
query_matrix = np.concatenate(data_query)[None, :, :, :]
key_matrix = np.concatenate(data_key)[None, :, :, :]
value_matrix = np.concatenate(data_value)[None, :, :, :]

In [None]:
# Display the current shape of query_matrix (it depends on which data-generation cell ran last).
query_matrix.shape

(12, 576, 64, 16, 64)

In [None]:
# Bias matrices are generated randomly with undetermined shape in the original code repo,
# so they are handled separately: flatten every logged bias tensor into one pool of values
# and sample a [1, 64, 1, 64] bias matrix from that pool.
# A single np.concatenate replaces repeated `list + list` growth, which re-copied the
# whole pool for every file and boxed every value as a Python object.
dummy = np.concatenate([
    tf.io.parse_tensor(tf.io.read_file(file), out_type=tf.float32).numpy().reshape(-1)
    for file in all_bias_files
])
#idx = np.random.choice(np.array(dummy), tuple(query_matrix.shape), replace=True)
idx = np.random.choice(dummy, [1, 64, 1, 64], replace=True)
bias_matrix = idx

In [None]:
# Turn the NumPy query/key/value data into TensorFlow tensors ahead of the memory profiling.
query_matrix, key_matrix, value_matrix = [
    tf.convert_to_tensor(array)
    for array in (query_matrix, key_matrix, value_matrix)
]

In [None]:
# Results of the baseline run are appended to a text file on Drive for later visualization.
old_file_path = "/content/drive/MyDrive/models/transformer/old_time_result.txt"
# Flip to True to delete the previous results before appending.
reset_old_results = False
if reset_old_results:
  os.remove(old_file_path)
# `f` stays open on purpose: the baseline profiling cell writes to it and closes it.
f = open(old_file_path, mode='a')
f.write("\nTest with one matrix with 576 * 64 * 16 * 64 shape\n")

52

In [None]:
import time
# Baseline attention with separated test matrices: one fused einsum pipeline per matrix.
# Prints peak/current GPU:0 memory after every iteration and appends the cumulative
# wall-clock time since the loop started to the already-open results file `f`.
res = []
try:
  if (tf.config.list_physical_devices('GPU')):
    tf.keras.backend.clear_session()
    print("Before doing attention operations. Reset the memory")
    tf.config.experimental.reset_memory_stats('GPU:0')
    memory1 = tf.config.experimental.get_memory_info('GPU:0')
    print(memory1['peak'])
    print(memory1['current'])
    # f.write("Before running : (%f, %f)\n" % (memory1['peak'], memory1['current']))
    start = time.time()
    for i in range(matrix_num):
      query = query_matrix[i]
      key = key_matrix[i]
      value = value_matrix[i]
      # query = query_matrix
      # key = key_matrix
      # value = value_matrix
      bias_value = bias_matrix
      # Only a single scalar from the bias matrix is added to the logits.
      bias = bias_value[0, 0, 0, 0]
      # key is B,T,N,H and query is B,F,N,H; logits come out as B,N,F,T.
      logits = tf.einsum("BTNH,BFNH->BNFT", key, query)
      logits += bias
      # Softmax runs over the last axis (T).
      weights = tf.nn.softmax(logits, name="attention_weights")
      weights = tf.nn.dropout(weights, rate=0.4)
      old_attention_output = tf.einsum("BNFT,BTNH->BFNH", weights, value)
      # Outputs are kept so they can be compared with the tiled implementation later.
      res.append(old_attention_output)
      memory2 = tf.config.experimental.get_memory_info('GPU:0')
      print("Memory after attention operation.")
      print(memory2['peak'])
      print(memory2['current'])
      #f.write("After Iteration %d : (%f, %f)\n" % (i, memory2['peak'], memory2['current']))
      # Terminate each record with a newline so iterations do not run together on one line.
      f.write("Iteration %d : time %f\n" % (i, time.time()-start))
finally:
  # Always flush and close the results file, even if an iteration fails (e.g. out of memory).
  f.close()

Before doing attention operations. Reset the memory
5435819264
5435819264
Memory after attention operation.
6446647040
5888804608
Memory after attention operation.
6635390720
6073353984
Memory after attention operation.
6824134400
6224348928
Memory after attention operation.
7012878080
6375343872
Memory after attention operation.
7050626816
6492784384
Memory after attention operation.
7541360384
6979323648
Memory after attention operation.
7692355328
7130318592
Memory after attention operation.
7843350272
7281313536
Memory after attention operation.
7994345216
7432308480
Memory after attention operation.
8145340160
7583303424
Memory after attention operation.
8296335104
7734298368
Memory after attention operation.
8485078784
7923042048


In [None]:
# Results of the tiled (FLAT) run are appended to their own text file on Drive.
new_file_path = "/content/drive/MyDrive/models/transformer/newtime_result.txt"
# Flip to True to delete the previous results before appending.
reset_new_results = False
if reset_new_results:
  os.remove(new_file_path)
# `f` stays open on purpose: the next profiling cell writes to it and closes it.
f = open(new_file_path, mode='a')
f.write("\nTest with one matrix with 576 * 64 * 16 * 64 shape\n")

52

In [None]:
import time
# Tiled ("FLAT") attention: computes attention one (batch element, head) slice at a time,
# grouped into tiles of `batch_granularity` x `head_granularity`, and reassembles the
# full output with tf.concat. Prints peak/current GPU:0 memory after every test matrix
# and appends the cumulative elapsed time to the already-open results file `f`.
new_res = []
if (tf.config.list_physical_devices('GPU')):
  tf.keras.backend.clear_session()
  print("Before doing attention operations. Reset the memory")
  tf.config.experimental.reset_memory_stats('GPU:0')
  memory1 = tf.config.experimental.get_memory_info('GPU:0')
  print(memory1['peak'])
  print(memory1['current'])
  start = time.time()
  # f.write("Before running : (%f, %f)\n" % (memory1['peak'], memory1['current']))
  # TODO: consolidate the variables into a function, try fusing the large matrix formed by
  # concatenating multiple value matrices, and try a higher granularity.
  for i in range(matrix_num):
    query = query_matrix[i]
    key = key_matrix[i]
    value = value_matrix[i]
    # query = query_matrix
    # key = key_matrix
    # value = value_matrix
    # source_length, dim and key_length are unpacked here but not used below.
    batch_size, source_length, head_num, dim = tf.shape(query).numpy()
    _,key_length,_,_, = tf.shape(key).numpy()
    batch_granularity = 32
    head_granularity = 8
    # One randomly chosen scalar from the bias matrix is used for this whole test matrix.
    bias_value = bias_matrix[0, np.random.randint(64), 0, np.random.randint(64)]
    # The outermost loop loops through batch_size, with granularity as stride
    for batch in tf.range(0, batch_size, batch_granularity):
      for head in tf.range(0, head_num, head_granularity):
        # Clamp the tile end so the last (possibly partial) tile stays in range.
        batch_termination = batch + batch_granularity if batch + batch_granularity <= batch_size else batch_size
        for unit_batch in tf.range(batch, batch_termination):
          head_termination = head + head_granularity if head + head_granularity <= head_num else head_num
          for unit_head in tf.range(head, head_termination):
            # num_or_size_splits=1 yields a single piece, so this loop body runs once
            # with the whole 2-D query slice for (unit_batch, unit_head).
            for query_source in tf.split(query[unit_batch, :, unit_head, :], num_or_size_splits=1, axis=0):
              key_source = key[unit_batch, :, unit_head, :]
              result = tf.matmul(query_source, tf.transpose(key_source))
              result = bias_value + result
              # Row granularity: softmax is applied over the last axis of the logits.
              result = tf.nn.softmax(result, name="attention_weights")
              result = tf.nn.dropout(result, rate=0.4)
            # Runs after the split loop, so `result` is the value from its last iteration.
            value_source = value[unit_batch, :, unit_head, :]
            attention_output = tf.matmul(result, value_source) #Matrix with size F * H

            # Collect the per-head outputs of this tile along a new head axis (axis 1).
            if unit_head == head:
              attention_output = tf.expand_dims(attention_output, axis=1)
              output_from_unit_head = attention_output
            else:
              attention_output = tf.expand_dims(attention_output, axis=1)
              output_from_unit_head = tf.concat([output_from_unit_head, attention_output], 1)
          # Collect the per-batch-element outputs of this tile along a new batch axis (axis 0).
          if unit_batch == batch:
            output_from_unit_head = tf.expand_dims(output_from_unit_head, axis=0)
            output_from_unit_batch = output_from_unit_head
          else:
            output_from_unit_head = tf.expand_dims(output_from_unit_head, axis=0)
            output_from_unit_batch = tf.concat([output_from_unit_batch, output_from_unit_head], 0)
        # Join the head tiles of one batch tile along the head axis (axis 2).
        if head == 0:
          suboutput = output_from_unit_batch
        else:
          suboutput = tf.concat([suboutput, output_from_unit_batch], 2)
      # Join the batch tiles along the batch axis (axis 0).
      if batch == 0:
        output = suboutput
      else:
        output = tf.concat([output, suboutput], 0)
    # Outputs are kept so they can be compared with the baseline implementation later.
    new_res.append(output)
    memory2 = tf.config.experimental.get_memory_info('GPU:0')
    print("Memory profiling using FLAT")
    print(memory2['peak'])
    print(memory2['current'])
    # f.write("After Iteration %d : (%f, %f)\n" % (i, memory2['peak'], memory2['current']))
    f.write("After Iteration %d : (%f)\n" % (i, time.time() - start))
f.close()

Before doing attention operations. Reset the memory
5435819264
5435819264
Memory profiling using FLAT
5774075648
5774075648
Memory profiling using FLAT
5918676736
5913999104
Memory profiling using FLAT
6118516480
6066730752
Memory profiling using FLAT
6247065344
6247028480
Memory profiling using FLAT
6467192576
6467192576
Memory profiling using FLAT
6626645760
6626645760
Memory profiling using FLAT
6766069504
6764517120
Memory profiling using FLAT
6992389632
6992389632
Memory profiling using FLAT
7131326208
7131326208
Memory profiling using FLAT
7349475072
7349475072
Memory profiling using FLAT
7492183808
7492183808
Memory profiling using FLAT
7702025984
7702025984


In [None]:
# Report GPU memory usage again via the NVIDIA CLI, this time after the profiling runs.
! nvidia-smi

Wed Dec  1 03:04:30 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    33W / 250W |  15855MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Results of the higher-granularity tiled run are appended to their own text file on Drive.
optimized_file_path = "/content/drive/MyDrive/models/transformer/optimizedtime_result.txt"
# Flip to True to delete the previous results before appending.
reset_optimized_results = False
if reset_optimized_results:
  # Remove this cell's own file (the earlier copy-paste removed new_file_path instead).
  os.remove(optimized_file_path)
# `f` stays open on purpose: the next profiling cell writes to it and closes it.
f = open(optimized_file_path, 'a')
f.write("\nTest with 12 matrix with each 576 * 64 * 16 * 64 shape\n")

56

In [None]:
# Print the shape of query_matrix right before the higher-granularity profiling run.
print(query_matrix.shape)

(1, 16384, 16, 64)


In [None]:
import time
# What if user wants to try even higher granularity?
# Tiled ("FLAT") attention that processes `head_unit` heads at once per batch element
# instead of a single head, then applies the value matrix once per batch element.
# Prints peak/current GPU:0 memory after every test matrix and appends the cumulative
# elapsed time to the already-open results file `f`.
new_res = []
if (tf.config.list_physical_devices('GPU')):
  # tf.keras.backend.clear_session()
  print("Before doing attention operations. Reset the memory")
  tf.config.experimental.reset_memory_stats('GPU:0')
  memory1 = tf.config.experimental.get_memory_info('GPU:0')
  print(memory1['peak'])
  print(memory1['current'])
  f.write("Before running : (%f, %f)\n" % (memory1['peak'], memory1['current']))
  start = time.time()
  for i in range(matrix_num):
    # Fixed typo (`qeury`): without it `query` silently kept a stale value from an earlier cell.
    query = query_matrix[i]
    key = key_matrix[i]
    value = value_matrix[i]
    # query = query_matrix
    # key = key_matrix
    # value = value_matrix
    batch_size, source_length, head_num, dim = tf.shape(query).numpy()
    _,key_length,_,_, = tf.shape(key).numpy()
    batch_granularity = 32
    head_granularity = 16
    # Number of heads gathered and processed together in one einsum.
    # NOTE(review): the gather below assumes head_num is a multiple of head_unit — confirm.
    head_unit = 8
    # One randomly chosen scalar from the bias matrix is used for this whole test matrix.
    bias_value = bias_matrix[0, np.random.randint(64), 0, np.random.randint(64)]
    for batch in tf.range(0, batch_size, batch_granularity):
      for head in tf.range(0, head_num, head_granularity):
        # Clamp the tile end so the last (possibly partial) tile stays in range.
        batch_termination = batch + batch_granularity if batch + batch_granularity <= batch_size else batch_size
        for unit_batch in tf.range(batch, batch_termination):
          head_termination = head + head_granularity if head + head_granularity <= head_num else head_num
          for unit_head in tf.range(head, head_termination, head_unit):
            # Gather `head_unit` consecutive heads along the head axis of one batch element.
            query_source = tf.gather(query[unit_batch, :, :, :], indices=tf.range(unit_head, unit_head + head_unit), axis=1)
            key_source = tf.gather(key[unit_batch, :, :, :], indices=tf.range(unit_head, unit_head + head_unit), axis=1)
            # key_source is T,N,H and query_source is F,N,H; logits come out as N,F,T.
            result = tf.einsum("TNH, FNH->NFT", key_source, query_source)
            # result += bias_value[None, :, :]
            result += bias_value
            result = tf.nn.softmax(result, name="attention_weights")
            result = tf.nn.dropout(result, rate=0.4)
            if unit_head == head:
              logit = result
            else:
              # Concatenate over head num dimension
              logit = tf.concat([logit, result], axis=0)
          # Apply the value matrix once per batch element, over all heads collected in `logit`.
          value_source = value[unit_batch, :, :, :]
          attention_output = tf.einsum("NFT,TNH->FNH", logit, value_source)
          # Collect the per-batch-element outputs of this tile along a new batch axis (axis 0).
          if unit_batch == batch:
            attention_output = tf.expand_dims(attention_output, axis=0)
            output_from_unit_batch = attention_output
          else:
            attention_output = tf.expand_dims(attention_output, axis=0)
            output_from_unit_batch = tf.concat([output_from_unit_batch, attention_output], 0)
        # Join the head tiles of one batch tile along the head axis (axis 2).
        if head == 0:
          suboutput = output_from_unit_batch
        else:
          suboutput = tf.concat([suboutput, output_from_unit_batch], 2)
      # Join the batch tiles along the batch axis (axis 0).
      if batch == 0:
        output = suboutput
      else:
        output = tf.concat([output, suboutput], 0)
    # Outputs are kept so they can be compared with the baseline implementation later.
    new_res.append(output)
    memory2 = tf.config.experimental.get_memory_info('GPU:0')
    print("Memory profiling using FLAT")
    print(memory2['peak'])
    print(memory2['current'])
    f.write("After Iteration %d : (%f)\n" % (i, time.time()-start))
f.close()

Before doing attention operations. Reset the memory
5435819264
5435819264
Memory profiling using FLAT
5501091584
5501091584
Memory profiling using FLAT
5517639424
5517639424
Memory profiling using FLAT
5538381824
5534615040
Memory profiling using FLAT
5551195648
5551195648
Memory profiling using FLAT
5571280896
5571280896
Memory profiling using FLAT
5594744320
5594744320
Memory profiling using FLAT
5611388672
5611388672
Memory profiling using FLAT
5628821248
5628821248
Memory profiling using FLAT
5645565696
5645565696
Memory profiling using FLAT
5664997120
5661984256
Memory profiling using FLAT
5684002816
5681316096
Memory profiling using FLAT
5707104512
5698357248


In [None]:
# Check Result: compare the baseline outputs (`res`) with the tiled outputs (`new_res`).
baseline_output = np.concatenate(res)
tiled_output = np.concatenate(new_res)
# Fail loudly on a shape mismatch; the subtraction below could otherwise broadcast
# silently and report a meaningless comparison.
assert baseline_output.shape == tiled_output.shape, (
    "shape mismatch: %s vs %s" % (baseline_output.shape, tiled_output.shape))
abs_diff = np.abs(baseline_output - tiled_output)
# NOTE(review): both implementations apply tf.nn.dropout without a seed, so their random
# masks differ and tight element-wise agreement is not expected; a tolerance of 1 is very loose.
print("max abs difference:", abs_diff.max())
diff = abs_diff < 1
np.all(diff)

In [None]:
# Display `output`, the tensor left over from the last iteration of the most recently run tiled profiling cell.
output

In [None]:
# Cast the last baseline attention output to float64 and display its shape.
old_attention_output = tf.cast(old_attention_output, dtype=tf.float64)
old_attention_output.shape

TensorShape([32, 32, 2, 10])

In [None]:
# Element-wise check that `output` is within 0.1 of the last baseline attention output.
# NOTE(review): old_attention_output is cast to float64 in the preceding cell; confirm that
# `output` has a matching dtype, since TensorFlow arithmetic generally requires equal dtypes.
np.abs(output - old_attention_output) < 0.1

In [None]:
# TODO:
# 1. GPU timing
# 2. Visualization
# 3. Compress to one function
# 4. PowerPoint
# 5. GitHub page with a UI