In [1]:
import importlib.metadata
importlib.metadata.version("torch")

'2.2.1'

In [None]:
pip install transformers accelerate

In [1]:
pip install flash-attn --no-build-isolation

^C
Note: you may need to restart the kernel to use updated packages.


**Profiler Implementation**

https://github.com/pytorch/pytorch/blob/487ebcac3bc10b4b4b0631dafe2a12ddb0852f2a/torch/csrc/profiler/python/init.cpp

https://github.com/pytorch/kineto/tree/main/libkineto/src

**Profiler log level**

  VERBOSE = 0,
  INFO = 1,
  WARNING = 2,
  ERROR = 3,
  STAGE = 4,
  ENUM_COUNT = 5

In [1]:
import os
os.environ["KINETO_LOG_LEVEL"] = "0"

In [2]:
import torch

torch.__version__

'2.2.1'

In [3]:
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("croissantllm/CroissantLLMBase", use_safetensors=False, device_map="auto", torch_dtype=torch.float16, attn_implementation="flash_attention_2")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
from torch.profiler import profile, record_function, ProfilerActivity

input_ids = torch.randint(low=0, high=32000, size=(50,1024), dtype=torch.int64).to(model.device)
attention_mask = torch.ones(50,1024).to(model.device)
model.eval()

with profile(
  activities=[ProfilerActivity.CPU,ProfilerActivity.CUDA],
  with_stack=True,
  experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True)
) as prof:
    with record_function("model_inference"):
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False) 

print(prof.key_averages().table())

-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
              model_inference         3.76%     208.695ms       100.00%        5.550s        5.550s      17.441ms         0.16%       10.949s       10.949s             1  
              aten::embedding         0.02%       1.310ms         1.29%      71.408ms      71.408ms     951.000us         0.01%      98.468ms      98.468ms             1  
                aten::reshape         0.08%       4.467ms         0.11%       6.235ms      25.764us       2.578ms         0.02%      15.751

In [5]:
from torch.nn import ModuleList
import inspect

memory_unit_mb = 1024*1024

def display_modules(module, name_prefix=None, depth=0, max_depth=99, forward_methods=None):
    if forward_methods is None:
        forward_methods = {}
    header = module.__class__.__name__
    if name_prefix is not None:
        header = f"{name_prefix}#{header}" 
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+header)
    if len(list(module.named_parameters(recurse=False))) > 0:
        print(depth_prefix+"> parameters")
        for name,parameter in module.named_parameters(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(parameter)}")
    if len(list(module.named_buffers(recurse=False))) > 0:
        print(depth_prefix+"> buffers")
        for name,buffer in module.named_buffers(recurse=False):
            print(depth_prefix+f"- {name}: {get_tensor_description(buffer)}")
    if len(list(module.named_children())) > 0:
        print(depth_prefix+"> submodules")
        for name,submodule in module.named_children():
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")
    source_code = inspect.getsource(module.forward)
    forward_methods[module.__class__.__name__] = source_code
    if depth < max_depth:
        for name,submodule in module.named_children():
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
    if depth==0:
        print()
        print()
        for module_type,source_code in forward_methods.items():
            print("---------------------")
            print(f"{module_type}.forward()")
            print("---------------------")
            print(source_code)
            
def display_module_list(module_list, name_prefix=None, depth=0, max_depth=1, forward_methods=None):
    # ------------------------------
    # Detect repeated layers in ModuleList: code inspired from Pytorch: ModuleList.__repr__    
    list_of_reprs = [repr(item) for item in module_list]
    if len(list_of_reprs) == 0:
        return

    start_end_indices = [[0, 0]]
    repeated_blocks = [list_of_reprs[0]]
    for i, r in enumerate(list_of_reprs[1:], 1):
        if r == repeated_blocks[-1]:
            start_end_indices[-1][1] += 1
            continue

        start_end_indices.append([i, i])
        repeated_blocks.append(r)
    # -------------------------------
    
    depth_prefix = "  "*depth
    print(depth_prefix+"---------------------")
    print(depth_prefix+f"{name_prefix}#ModuleList")
    print(depth_prefix+"> submodules")
    named_submodules = []
    for (start_id, end_id) in start_end_indices:
        submodule = module_list[start_id]
        if start_id != end_id:      
            name = f"{start_id}..{end_id}"
            print(depth_prefix+f"- {name}: {(end_id-start_id+1)}X {submodule.__class__.__name__}")
        else:
            name = str(start_id)
            print(depth_prefix+f"- {name}: {submodule.__class__.__name__}")        
        named_submodules.append((name,submodule))
    if depth < max_depth:
        for name,submodule in named_submodules:
            if isinstance(submodule, ModuleList):
                display_module_list(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)
            else:
                display_modules(submodule, name_prefix=name, depth=depth+1, max_depth=max_depth, forward_methods=forward_methods)

def get_tensor_description(t):
    dtype = str(t.dtype)[6:]
    dimensions = str(t.size())[11:-1]
    total_byte_size = t.numel() * t.element_size()
    return f"{dtype} {dimensions} ({(total_byte_size/memory_unit_mb):.1f} MB)"

In [6]:
display_modules(model)

---------------------
LlamaForCausalLM
> submodules
- model: LlamaModel
- lm_head: Linear
  ---------------------
  model#LlamaModel
  > buffers
  - causal_mask: int64 [2048, 2048] (32.0 MB)
  > submodules
  - embed_tokens: Embedding
  - layers: ModuleList
  - norm: LlamaRMSNorm
    ---------------------
    embed_tokens#Embedding
    > parameters
    - weight: float16 [32000, 2048] (125.0 MB)
    ---------------------
    layers#ModuleList
    > submodules
    - 0..23: 24X LlamaDecoderLayer
      ---------------------
      0..23#LlamaDecoderLayer
      > submodules
      - self_attn: LlamaFlashAttention2
      - mlp: LlamaMLP
      - input_layernorm: LlamaRMSNorm
      - post_attention_layernorm: LlamaRMSNorm
        ---------------------
        self_attn#LlamaFlashAttention2
        > submodules
        - q_proj: Linear
        - k_proj: Linear
        - v_proj: Linear
        - o_proj: Linear
        - rotary_emb: LlamaRotaryEmbedding
          ---------------------
          q_pr

In [39]:
def profile_forward(model, batch_size=1, seq_length=None):
    if seq_length is None: seq_length = model.config.max_position_embeddings
    input_ids = torch.randint(low=0, high=32000, size=(batch_size,seq_length), dtype=torch.int64).to(model.device)
    attention_mask = torch.ones(batch_size,seq_length).to(model.device)
    model.eval()
    with torch.profiler.profile(activities=[ProfilerActivity.CPU,ProfilerActivity.CUDA], record_shapes=True, profile_memory=True, with_stack=True, with_flops=True, with_modules=True, experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True)) as prof:
        with torch.profiler.record_function("MODEL INFERENCE"):
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False, output_attentions=False, output_hidden_states=False)        

    return prof

In [40]:
prof = profile_forward(model)

In [179]:
print(prof.key_averages().table())

-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  Total MFLOPs  
-----------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
              MODEL INFERENCE        29.18%     224.864ms       100.00%     770.521ms     770.521ms      30.171ms         3.85%     782.769ms     782.769ms           0 b           0 b     250.00 Mb      -8.50 Gb             1            --  
              aten::embedding 

In [41]:
events = prof.events()

In [195]:
module_file = __import__(model.__module__).__file__
packages_dir = "site-packages"
index = module_file.find(packages_dir)
packages_path = module_file[:index + len(packages_dir)+1]
packages_path


def add_call_stacks(event):
    filtered_stack = []
    torch_calls = []
    for frame in event.stack:
        if frame.endswith("profile_forward"):
            break
        elif not frame.startswith("<built-in") and not frame.startswith("torch/"):
            function = frame.split(": ")[1]
            if function!="_call_impl":
                filtered_stack.append(function)
        elif frame.startswith("<built-in method"):
            frame_words = frame.split(" ")
            torch_calls.append(frame_words[2])
            torch_calls.append(frame_words[4])
        elif frame.startswith("<built-in function"):
            frame_words = frame.split(" ")
            torch_calls.append(frame_words[2][:-1])
    filtered_stack.reverse()    
    event.call_stack = ".".join(filtered_stack)
    torch_calls.reverse()
    event.torch_stack = ".".join(torch_calls)

def print_profiler_event(event, show_code_lines=False):
    csindex = 0
    call_site = event.stack[csindex].split(": ")[0]
    while call_site.startswith("<built-in") or call_site.startswith("torch/"):
        csindex += 1
        call_site = event.stack[csindex].split(": ")[0]
        
    if show_code_lines:
        base_dir = packages_path
        relative_path = call_site.split("(")[0]
        line_number = int(call_site.split("(")[1][:-1])
        try:
            with open(base_dir+relative_path, 'r', encoding='utf-8') as file:
                file_content = file.read()
                call_line = ""
                for idx,line in enumerate(file_content.splitlines()):
                    if idx>=line_number-1:
                        call_line += f"{idx} {line}\n"
                        if line.strip().startswith("return "):
                            break
        except:
            call_line = ""
    
    add_call_stacks(event)
    
    filtered_inputs = []
    for input_shape in event.input_shapes:
        if len(input_shape)>0:
            filtered_inputs.append(input_shape)
    
    print(f"- call stack : {event.call_stack}")
    print(f"- torch stack: {event.torch_stack}")
    print(f"- kernel     : {event.name}")
    print(f"- inputs     : {filtered_inputs}")
    print(f"- cpu time   : {event.cpu_time/1000:.2f} ms")
    print(f"- cpu memory : {event.cpu_memory_usage/1024/1024:.2f} MB")
    print(f"- gpu time   : {event.cuda_time/1000:.2f} ms")
    print(f"- gpu memory : {event.cuda_memory_usage/1024/1024:.2f} MB")
    print(f"- flops      : {event.flops}")    
    if show_code_lines:
        print(f"- func src   : {call_site}")
        print(call_line)

In [196]:
print_profiler_event(events[1203], show_code_lines=True)

- call stack : LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_6.forward.LlamaFlashAttention2_6.forward.LlamaRotaryEmbedding_6.forward
- torch stack: Tensor.float
- kernel     : aten::_to_copy
- inputs     : [[1, 1, 2048]]
- cpu time   : 0.17 ms
- cpu memory : 0.00 MB
- gpu time   : 0.16 ms
- gpu memory : 0.01 MB
- flops      : 0
- func src   : transformers\models\llama\modeling_llama.py(119)
118     def forward(self, x, position_ids, seq_len=None):
119         if seq_len is not None:
121 
122         # x: [bs, num_attention_heads, seq_len, head_size]
123         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
124         position_ids_expanded = position_ids[:, None, :].float()
125         freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
126         emb = torch.cat((freqs, freqs), dim=-1)
127         cos = emb.cos().to(dtype=x.dtype)
128         sin = emb.sin().to(dtype=x.dtype)
129         # backwards c

In [180]:
total_time = 0
for event in events:
    if event.cpu_parent is not None and event.cpu_parent.id == events[0].id:
        total_time += event.cpu_time
events[0].cpu_time ,total_time + events[0].self_cpu_time_total

(770521.0, 770521.0)

In [189]:
threshold = events[0].cuda_time/1000

In [193]:
total_time = 0
count_events = 0
count_threshold = 0
for event in events:
    if event.cpu_parent is not None and event.cpu_parent.id == events[0].id:
        count_events += 1
        if event.cuda_time>=threshold:
            count_threshold += 1
            total_time += event.cuda_time
count_threshold, count_threshold/count_events*100, total_time/events[0].cuda_time*100

(282, 16.62735849056604, 79.96982506972044)

In [205]:
coalesce_layers = "LlamaDecoderLayer"

call_stack = "LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward"
start_index = call_stack.find(coalesce_layers)
if start_index >= 0:
    dot_index = call_stack.find('.', start_index)
    layer_index = int(call_stack[start_index+len(coalesce_layers)+1:dot_index])
    call_stack = call_stack[0:start_index+len(coalesce_layers)]+call_stack[dot_index:]
print(call_stack,layer_index)

LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer.forward.LlamaRMSNorm_0.forward 0


In [231]:
from IPython.display import display, Markdown

coalesce_layers = "LlamaDecoderLayer"

coalesced_events = []
first_layer0_event_index = 0
layer0_events_count = 0
layer_index = 0
event_index_in_layer = 0
for event in events:
    if event.cpu_parent is not None and event.cpu_parent.id == events[0].id:
        add_call_stacks(event)
        
        key = event.call_stack
        start_index = key.find(coalesce_layers)
        if start_index >= 0:
            dot_index = key.find('.', start_index)
            current_layer_index = int(key[start_index+len(coalesce_layers)+1:dot_index])

            if first_layer0_event_index == 0:
                first_layer0_event_index = len(coalesced_events)
                print(f"first layers 0 event index: {first_layer0_event_index}")
            if layer0_events_count == 0 and current_layer_index == 1:
                layer0_events_count = len(coalesced_events) - first_layer0_event_index
                print(f"layers 0 events count: {layer0_events_count}, events length: {len(coalesced_events)}")
            
            if current_layer_index > layer_index:
                layer_index = current_layer_index 
                if event_index_in_layer != layer0_events_count:
                    print(f"ERROR at layer {layer_index}: number of events {event_index_in_layer} different of layer 0 events count: {layer0_events_count}")
                    break
                event_index_in_layer = 0                        
            
            if layer_index == 0:
                event.layers_count = 1
                event.layers_cpu_time = event.cpu_time
                event.layers_cuda_time = event.cuda_time
                coalesced_events.append(event)
            else:
                first_event = coalesced_events[first_layer0_event_index + event_index_in_layer]
                first_event.layers_count += 1
                first_event.layers_cpu_time += event.cpu_time
                first_event.layers_cuda_time += event.cuda_time

            event_index_in_layer += 1
        else:
            coalesced_events.append(event)

table =  "| Cuda time (µs) | Cuda time (%) | Calls | Stack | PyTorch | Function |\n" 
table += "| -------------- | ------------- | ----- | ----- | ------- | -------- |\n" 
for event in coalesced_events:
    if getattr(event, "layers_count", 0) > 0:
        table += f"| {int(event.layers_cuda_time)} | {(event.layers_cuda_time/events[0].cuda_time*100):.2f} | {event.layers_count} | {event.call_stack} | {event.torch_stack} | {event.name} |\n"
    else:
        table += f"| {int(event.cuda_time)} | {(event.cuda_time/events[0].cuda_time*100):.2f} | 1 | {event.call_stack} | {event.torch_stack} | {event.name} |\n"

display(Markdown(table.replace("__","\\_\\_")))

first layers 0 event index: 6
layers 0 events count: 70, events length: 76


| Cuda time (µs) | Cuda time (%) | Calls | Stack | PyTorch | Function |
| -------------- | ------------- | ----- | ----- | ------- | -------- |
| 1653 | 0.21 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.Embedding_0.forward.embedding | type.embedding | aten::embedding |
| 111 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward | type.arange | aten::arange |
| 38 | 0.00 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward | Tensor.unsqueeze | aten::unsqueeze |
| 56 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward._update_causal_mask.\_\_contains\_\_ |  | aten::eq |
| 223 | 0.03 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward._update_causal_mask.\_\_contains\_\_ | Tensor.any | aten::any |
| 163 | 0.02 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward._update_causal_mask.\_\_contains\_\_ | Tensor.item | aten::item |
| 10961 | 1.40 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward | Tensor.to | aten::to |
| 10873 | 1.39 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward | Tensor.pow | aten::pow |
| 6732 | 0.86 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward | Tensor.mean | aten::mean |
| 289 | 0.04 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward |  | aten::add |
| 285 | 0.04 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward | type.rsqrt | aten::rsqrt |
| 10428 | 1.33 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward |  | aten::mul |
| 9258 | 1.18 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward | Tensor.to | aten::to |
| 6025 | 0.77 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_0.forward |  | aten::mul |
| 45574 | 5.82 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.Linear_0.forward | linear | aten::linear |
| 46018 | 5.88 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.Linear_1.forward | linear | aten::linear |
| 45519 | 5.82 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.Linear_2.forward | linear | aten::linear |
| 56 | 0.01 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.view | aten::view |
| 169 | 0.02 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.transpose | aten::transpose |
| 93 | 0.01 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.view | aten::view |
| 1437 | 0.18 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.transpose | aten::transpose |
| 588 | 0.08 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.view | aten::view |
| 1569 | 0.20 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.transpose | aten::transpose |
| 1166 | 0.15 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::unsqueeze |
| 1555 | 0.20 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::slice |
| 1689 | 0.22 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::unsqueeze |
| 203 | 0.03 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.float | aten::to |
| 1475 | 0.19 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.expand | aten::expand |
| 1439 | 0.18 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::slice |
| 2316 | 0.30 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::unsqueeze |
| 1595 | 0.20 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::slice |
| 5464 | 0.70 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.float | aten::to |
| 13297 | 1.70 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward |  | aten::matmul |
| 1824 | 0.23 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.transpose | aten::transpose |
| 1630 | 0.21 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | type.cat | aten::cat |
| 1398 | 0.18 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.cos | aten::cos |
| 4817 | 0.62 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.to | aten::to |
| 1676 | 0.21 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.sin | aten::sin |
| 4618 | 0.59 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.LlamaRotaryEmbedding_0.forward | Tensor.to | aten::to |
| 2237 | 0.29 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb | Tensor.unsqueeze | aten::unsqueeze |
| 1785 | 0.23 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb | Tensor.unsqueeze | aten::unsqueeze |
| 6993 | 0.89 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb |  | aten::mul |
| 1026 | 0.13 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half |  | aten::slice |
| 1107 | 0.14 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half |  | aten::slice |
| 3849 | 0.49 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half |  | aten::neg |
| 7671 | 0.98 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half | type.cat | aten::cat |
| 5780 | 0.74 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb |  | aten::mul |
| 6947 | 0.89 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb |  | aten::add |
| 4993 | 0.64 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb |  | aten::mul |
| 1142 | 0.15 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half |  | aten::slice |
| 1211 | 0.15 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half |  | aten::slice |
| 3920 | 0.50 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half |  | aten::neg |
| 6038 | 0.77 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb.rotate_half | type.cat | aten::cat |
| 4992 | 0.64 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb |  | aten::mul |
| 6954 | 0.89 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.apply_rotary_pos_emb |  | aten::add |
| 917 | 0.12 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.transpose | aten::transpose |
| 1030 | 0.13 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.transpose | aten::transpose |
| 2641 | 0.34 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.transpose | aten::transpose |
| 44055 | 5.63 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward._flash_attention_forward.flash_attn_func.apply | FunctionMeta.apply | FlashAttnFunc |
| 165 | 0.02 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward | Tensor.reshape | aten::reshape |
| 41821 | 5.34 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaFlashAttention2_0.forward.Linear_3.forward | linear | aten::linear |
| 6378 | 0.81 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward |  | aten::add |
| 8433 | 1.08 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward | Tensor.to | aten::to |
| 9885 | 1.26 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward | Tensor.pow | aten::pow |
| 5839 | 0.75 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward | Tensor.mean | aten::mean |
| 770 | 0.10 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward |  | aten::add |
| 815 | 0.10 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward | type.rsqrt | aten::rsqrt |
| 9109 | 1.16 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward |  | aten::mul |
| 9236 | 1.18 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward | Tensor.to | aten::to |
| 5497 | 0.70 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaRMSNorm_1.forward |  | aten::mul |
| 83935 | 10.72 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaMLP_0.forward.Linear_4.forward | linear | aten::linear |
| 11112 | 1.42 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaMLP_0.forward.SiLU_0.forward.silu | silu | aten::silu |
| 80307 | 10.26 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaMLP_0.forward.Linear_5.forward | linear | aten::linear |
| 17147 | 2.19 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaMLP_0.forward |  | aten::mul |
| 84765 | 10.83 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward.LlamaMLP_0.forward.Linear_6.forward | linear | aten::linear |
| 6261 | 0.80 | 24 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaDecoderLayer_0.forward |  | aten::add |
| 353 | 0.05 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward | Tensor.to | aten::to |
| 111 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward | Tensor.pow | aten::pow |
| 80 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward | Tensor.mean | aten::mean |
| 8 | 0.00 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward |  | aten::add |
| 47 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward | type.rsqrt | aten::rsqrt |
| 183 | 0.02 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward |  | aten::mul |
| 102 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward | Tensor.to | aten::to |
| 65 | 0.01 | 1 | LlamaForCausalLM_0.forward.LlamaModel_0.forward.LlamaRMSNorm_48.forward |  | aten::mul |
| 9440 | 1.21 | 1 | LlamaForCausalLM_0.forward.Linear_168.forward | linear | aten::linear |
| 1166 | 0.15 | 1 | LlamaForCausalLM_0.forward | Tensor.float | aten::to |


In [194]:
for event in events:
    if event.cpu_parent is not None and event.cpu_parent.id == events[0].id:
        if event.cuda_time>=threshold:
            print(event.name, int(event.cuda_time), f"{(event.cuda_time/events[0].cuda_time*100):.2f} %")
            if event.cpu_children is not None:
                for child_event in event.cpu_children:
                    if child_event.cuda_time>=threshold:
                        print("-", child_event.name, int(child_event.cuda_time), f"{(child_event.cuda_time/event.cuda_time*100):.2f} %")
                        if child_event.cpu_children is not None:
                            for grandchild_event in child_event.cpu_children:
                                if grandchild_event.cuda_time>=threshold:
                                    print(" ","-", grandchild_event.name, int(grandchild_event.cuda_time), f"{(grandchild_event.cuda_time/event.cuda_time*100):.2f} %")
                                    if grandchild_event.cpu_children is not None:
                                        for Grandchild_event in grandchild_event.cpu_children:
                                            if Grandchild_event.cuda_time>=threshold:
                                                print(" "," ","-", Grandchild_event.name, int(Grandchild_event.cuda_time), f"{(Grandchild_event.cuda_time/event.cuda_time*100):.2f} %")

aten::embedding 1653 0.21 %
- aten::index_select 1394 84.33 %
aten::to 1929 0.25 %
- aten::_to_copy 1895 98.24 %
  - aten::copy_ 1782 92.38 %
aten::pow 2127 0.27 %
aten::mean 1261 0.16 %
aten::mul 2072 0.26 %
aten::to 1765 0.23 %
- aten::_to_copy 1758 99.60 %
  - aten::copy_ 1744 98.81 %
aten::mul 1129 0.14 %
aten::linear 5832 0.75 %
- aten::matmul 5805 99.54 %
  - aten::mm 5628 96.50 %
aten::linear 5639 0.72 %
- aten::matmul 5501 97.55 %
  - aten::mm 5394 95.66 %
aten::linear 5628 0.72 %
- aten::matmul 5501 97.74 %
  - aten::mm 5409 96.11 %
aten::mul 1187 0.15 %
aten::cat 1065 0.14 %
aten::mul 1125 0.14 %
aten::add 1650 0.21 %
aten::mul 1099 0.14 %
aten::cat 1081 0.14 %
aten::mul 1121 0.14 %
aten::add 1643 0.21 %
FlashAttnFunc 6576 0.84 %
aten::linear 5800 0.74 %
- aten::matmul 5658 97.55 %
  - aten::mm 5614 96.79 %
aten::add 1619 0.21 %
aten::to 1744 0.22 %
- aten::_to_copy 1734 99.43 %
  - aten::copy_ 1720 98.62 %
aten::pow 2120 0.27 %
aten::mean 1267 0.16 %
aten::mul 2098 0.27 %
at

Jarvislabs Pytorch VM config:

### Jeremy Howard : How to install Pytorch and cuda with conda

https://twitter.com/jeremyphoward/status/1697435241152127369

1. Install miniconda

https://github.com/fastai/fastsetup/blob/master/setup-conda.sh

2. Find out what CUDA version PyTorch expects by going to their website and seeing what the latest "compute platform" version is.

https://pytorch.org/

> conda install pytorch ... pytorch-cuda=12.1 ...

3. Install CUDA

```
conda install cuda -c nvidia/label/cuda-12.1.0
```

4.Copy the command to install pytorch from their website, but replace `-c nvidia` with a version specific label, as shown below:

```
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia/label/cuda-12.1.0`
```

### ubuntu version

```
root@ac9978bd266c:~# lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 22.04.3 LTS
Release:        22.04
Codename:       jammy

root@ac9978bd266c:~# uname -a
Linux ac9978bd266c 5.15.0-89-generic #99-Ubuntu SMP Mon Oct 30 20:42:41 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux

root@ac9978bd266c:~# cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.3 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.3 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
```

### container setup

Docker container ID

```
root@ac9978bd266c:~# echo $HOSTNAME
ac9978bd266c
```

Docker container entrypoint

```
root@ac9978bd266c:~# cat /docker-entrypoint.sh
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate py3.10                   
echo "PasswordAuthentication no" >> /etc/ssh/sshd_config
service ssh start
export SHELL="/bin/bash"

env HOME=/home code-server --host 0.0.0.0 --port 7007 --auth none&
env HOME=/home jupyter lab --ip=0.0.0.0 --NotebookApp.token=$TOKEN  --allow-root --port 8889
```

```
root@ac9978bd266c:~# ps aux
USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root           1  0.0  0.0   4360  3392 ?        Ss   08:26   0:00 /bin/bash /docker-entrypoint.sh
root          18  0.0  0.0  15428  3732 ?        Ss   08:26   0:00 sshd: /usr/sbin/sshd [listener] 0 of 10-100 startups
root          19  0.0  0.0 1238088 64232 ?       Sl   08:26   0:00 /usr/lib/code-server/lib/node /usr/lib/code-server --host 0.0.0.0 --port 7007 --auth none
root          20  0.2  0.0 888616 94924 ?        Rl   08:26   0:07 /root/miniconda3/envs/py3.10/bin/python /root/miniconda3/envs/py3.10/bin/jupyter-lab --ip=0.0.0.0 --NotebookApp.token=3vAcOEC8d551ymv0ppLqoR69HaQEfJB-1KEiXP1WwS8WlZbWf0_R7tCOxXoV8T8G --allo
root          46  0.0  0.0 1172236 64432 ?       Sl   08:26   0:00 /usr/lib/code-server/lib/node /usr/lib/code-server/out/node/entry
```

Jarvislab Urls

- Jupyterlab: https://ac9978bd266c.notebooksh.jarvislabs.net/
- VS Code:    https://ac9978bd266c0.notebooksh.jarvislabs.net/
- Any service listening on port 6006 inside the container: https://ac9978bd266c1.notebooksh.jarvislabs.net/

### miniconda install

```
root@ac9978bd266c:/root/miniconda3# which conda
/root/miniconda3/bin/conda

root@ac9978bd266c:/root/miniconda3# conda --version
conda 23.11.0

root@ac9978bd266c:~# which python
/root/miniconda3/envs/py3.10/bin/python

root@ac9978bd266c:/root/miniconda3# python --version
Python 3.10.13
```

### miniconda environments

```
root@ac9978bd266c:/root/miniconda3# conda env list
base                     /root/miniconda3
py3.10                   /root/miniconda3/envs/py3.10

root@ac9978bd266c:/root/miniconda3/envs/py3.10/conda-meta# cat history 
==> 2024-01-30 10:00:19 <==
# cmd: /root/miniconda3/bin/conda create -n py3.10 python=3.10
# conda version: 23.11.0
```

### apt source

https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 

### nvidia packages

```
cuda-cccl-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-compat-12-3/unknown,now 545.23.08-1 amd64 [installed]
cuda-cudart-12-3/unknown,now 12.3.101-1 amd64 [installed]
cuda-cudart-dev-12-3/unknown,now 12.3.101-1 amd64 [installed]
cuda-cuobjdump-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-cupti-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-cupti-dev-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-cuxxfilt-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-driver-dev-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-gdb-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-nvdisasm-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-nvml-dev-12-3/unknown,now 12.3.101-1 amd64 [installed]
cuda-nvprof-12-3/unknown,now 12.3.101-1 amd64 [installed]
cuda-nvprune-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-nvtx-12-3/unknown,now 12.3.101-1 amd64 [installed]
cuda-opencl-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-opencl-dev-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-profiler-api-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-sanitizer-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
cuda-toolkit-12-3-config-common/unknown,now 12.3.101-1 all [installed,automatic]
cuda-toolkit-12-config-common/unknown,now 12.3.101-1 all [installed,automatic]
cuda-toolkit-config-common/unknown,now 12.3.101-1 all [installed,automatic]
libcublas-12-3/unknown,now 12.3.4.1-1 amd64 [installed]
libcublas-dev-12-3/unknown,now 12.3.4.1-1 amd64 [installed]
libcufft-12-3/unknown,now 11.0.12.1-1 amd64 [installed,automatic]
libcufft-dev-12-3/unknown,now 11.0.12.1-1 amd64 [installed,automatic]
libcufile-12-3/unknown,now 1.8.1.2-1 amd64 [installed,automatic]
libcufile-dev-12-3/unknown,now 1.8.1.2-1 amd64 [installed,automatic]
libcusolver-12-3/unknown,now 11.5.4.101-1 amd64 [installed,automatic]
libcusolver-dev-12-3/unknown,now 11.5.4.101-1 amd64 [installed,automatic]
libcusparse-12-3/unknown,now 12.2.0.103-1 amd64 [installed]
libcusparse-dev-12-3/unknown,now 12.2.0.103-1 amd64 [installed]
libnpp-12-3/unknown,now 12.2.3.2-1 amd64 [installed]
libnpp-dev-12-3/unknown,now 12.2.3.2-1 amd64 [installed]
libnvjitlink-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
libnvjitlink-dev-12-3/unknown,now 12.3.101-1 amd64 [installed,automatic]
libnvjpeg-12-3/unknown,now 12.3.0.81-1 amd64 [installed,automatic]
libnvjpeg-dev-12-3/unknown,now 12.3.0.81-1 amd64 [installed,automatic]
nsight-compute-2023.3.1/unknown,now 2023.3.1.1-1 amd64 [installed,automatic]
```

### Environment variables

```
NV_LIBCUBLAS_VERSION=12.3.4.1-1
NVIDIA_VISIBLE_DEVICES=3
NV_NVML_DEV_VERSION=12.3.101-1
NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.19.3-1+cuda12.3
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.19.3-1
PYTHON_VERSION=3.10
NVIDIA_REQUIRE_CUDA=cuda>=12.3 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 brand=tesla,driver>=535,driver<536 brand=unknown,driver>=535,driver<536 brand=nvidia,driver>=535,driver<536 brand=nvidiartx,driver>=535,driver<536 brand=geforce,driver>=535,driver<536 brand=geforcertx,driver>=535,driver<536 brand=quadro,driver>=535,driver<536 brand=quadrortx,driver>=535,driver<536 brand=titan,driver>=535,driver<536 brand=titanrtx,driver>=535,driver<536
NV_LIBCUBLAS_DEV_PACKAGE=libcublas-dev-12-3=12.3.4.1-1
NV_NVTX_VERSION=12.3.101-1
NV_CUDA_CUDART_DEV_VERSION=12.3.101-1
NV_LIBCUSPARSE_VERSION=12.2.0.103-1
NV_LIBNPP_VERSION=12.2.3.2-1
NCCL_VERSION=2.19.3-1
NVIDIA_DRIVER_CAPABILITIES=compute,utility
NV_NVPROF_DEV_PACKAGE=cuda-nvprof-12-3=12.3.101-1
NV_LIBNPP_PACKAGE=libnpp-12-3=12.2.3.2-1
NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev
NV_LIBCUBLAS_DEV_VERSION=12.3.4.1-1
NVIDIA_PRODUCT_NAME=CUDA
NV_LIBCUBLAS_DEV_PACKAGE_NAME=libcublas-dev-12-3
NV_CUDA_CUDART_VERSION=12.3.101-1
CUDA_VERSION=12.3.1
NV_LIBCUBLAS_PACKAGE=libcublas-12-3=12.3.4.1-1
NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE=cuda-nsight-compute-12-3=12.3.1-1
PYDEVD_USE_FRAME_EVAL=NO
NV_LIBNPP_DEV_PACKAGE=libnpp-dev-12-3=12.2.3.2-1
NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-3
NV_LIBNPP_DEV_VERSION=12.2.3.2-1
NV_LIBCUSPARSE_DEV_VERSION=12.2.0.103-1
LIBRARY_PATH=/usr/local/cuda/lib64/stubs
NV_CUDA_LIB_VERSION=12.3.1-1
NVARCH=x86_64
NV_CUDA_COMPAT_PACKAGE=cuda-compat-12-3
NV_LIBNCCL_PACKAGE=libnccl2=2.19.3-1+cuda12.3
LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
NV_CUDA_NSIGHT_COMPUTE_VERSION=12.3.1-1
NV_NVPROF_VERSION=12.3.101-1
PATH=/root/miniconda3/envs/py3.10/bin:/root/miniconda3/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
NV_LIBNCCL_PACKAGE_NAME=libnccl2
NV_LIBNCCL_PACKAGE_VERSION=2.19.3-1
```

### python packages

```
libpython3-stdlib/jammy-updates,jammy-security,now 3.10.6-1~22.04 amd64 [installed,automatic]
libpython3.10-minimal/jammy-updates,jammy-security,now 3.10.12-1~22.04.3 amd64 [installed,automatic]
libpython3.10-stdlib/jammy-updates,jammy-security,now 3.10.12-1~22.04.3 amd64 [installed,automatic]
libpython3.10/jammy-updates,jammy-security,now 3.10.12-1~22.04.3 amd64 [installed,automatic]
python3-dbus/jammy,now 1.2.18-3build1 amd64 [installed,automatic]
python3-distro/jammy,now 1.7.0-1 all [installed,automatic]
python3-gi/jammy-updates,now 3.42.1-0ubuntu1 amd64 [installed,automatic]
python3-minimal/jammy-updates,jammy-security,now 3.10.6-1~22.04 amd64 [installed,automatic]
python3.10-minimal/jammy-updates,jammy-security,now 3.10.12-1~22.04.3 amd64 [installed,automatic]
python3.10/jammy-updates,jammy-security,now 3.10.12-1~22.04.3 amd64 [installed,automatic]
python3/jammy-updates,jammy-security,now 3.10.6-1~22.04 amd64 [installed,automatic]
```

```
print(sys.path)
['/home', 
 '/root/miniconda3/envs/py3.10/lib/python310.zip', 
 '/root/miniconda3/envs/py3.10/lib/python3.10', 
 '/root/miniconda3/envs/py3.10/lib/python3.10/lib-dynload', 
 '', 
 '/root/miniconda3/envs/py3.10/lib/python3.10/site-packages']
```

### Jupyterlab config

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_notebook_config.d/jupyter-lsp-notebook.json

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_notebook_config.d/jupyter-lsp-notebook.json
{
  "NotebookApp": {
    "nbserver_extensions": {
      "jupyter_lsp": true
    }
  }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_notebook_config.d/jupyterlab.json

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_notebook_config.d/jupyterlab.json
{
  "NotebookApp": {
    "nbserver_extensions": {
      "jupyterlab": true
    }
  }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/jupyter-lsp-jupyter-server.json

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/jupyter-lsp-jupyter-server.json
{
  "ServerApp": {
    "jpserver_extensions": {
      "jupyter_lsp": true
    }
  }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/jupyter_server_terminals.json

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/jupyter_server_terminals.json
{
  "ServerApp": {
    "jpserver_extensions": {
      "jupyter_server_terminals": true
    }
  }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/jupyterlab.json

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/jupyterlab.json
{
  "ServerApp": {
    "jpserver_extensions": {
      "jupyterlab": true
    }
  }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/notebook.json


```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/notebook.json
{
  "ServerApp": {
    "jpserver_extensions": {
      "notebook": true
    }
  }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/notebook_shim.json

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/jupyter_server_config.d/notebook_shim.json
{
    "ServerApp": {
        "jpserver_extensions": {
            "notebook_shim": true
        }
    }
}
```

/root/miniconda3/envs/py3.10/etc/jupyter/nbconfig/notebook.d/widgetsnbextension.json 

```
root@ac9978bd266c:~# cat /root/miniconda3/envs/py3.10/etc/jupyter/nbconfig/notebook.d/widgetsnbextension.json 
{
  "load_extensions": {
    "jupyter-js-widgets/extension": true
  }
}
```

### code-server config

```
root@ac9978bd266c:~# cat ~/.config/code-server/config.yaml

> nothing

root@ac9978bd266c:~# /usr/lib/code-server/bin/code-server --list-extensions

> nothing
```

### pip list

```
diffusers                 0.26.3
jupyterlab                4.0.11
numpy                     1.26.3
nvidia-cublas-cu12        12.1.3.1
nvidia-cuda-cupti-cu12    12.1.105
nvidia-cuda-nvrtc-cu12    12.1.105
nvidia-cuda-runtime-cu12  12.1.105
nvidia-cudnn-cu12         8.9.2.26
nvidia-cufft-cu12         11.0.2.54
nvidia-curand-cu12        10.3.2.106
nvidia-cusolver-cu12      11.4.5.107
nvidia-cusparse-cu12      12.1.0.106
nvidia-nccl-cu12          2.18.1
nvidia-nvjitlink-cu12     12.3.101
nvidia-nvtx-cu12          12.1.105
pandas                    2.2.0
spacy                     3.7.4
torch                     2.1.2
transformers              4.37.2
triton                    2.1.0
```

```
root@ac9978bd266c:~# pip show torch
Name: torch
Version: 2.1.2
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /root/miniconda3/envs/py3.10/lib/python3.10/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: torchaudio, torchvision
```

```
root@ac9978bd266c:/root/miniconda3# conda list -n py3.10
# packages in environment at /root/miniconda3/envs/py3.10:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
_openmp_mutex             5.1                       1_gnu  
annotated-types           0.6.0                    pypi_0    pypi
anyio                     4.2.0                    pypi_0    pypi
argon2-cffi               23.1.0                   pypi_0    pypi
argon2-cffi-bindings      21.2.0                   pypi_0    pypi
arrow                     1.3.0                    pypi_0    pypi
asttokens                 2.4.1                    pypi_0    pypi
async-lru                 2.0.4                    pypi_0    pypi
attrs                     23.2.0                   pypi_0    pypi
babel                     2.14.0                   pypi_0    pypi
beautifulsoup4            4.12.3                   pypi_0    pypi
bleach                    6.1.0                    pypi_0    pypi
blis                      0.7.11                   pypi_0    pypi
bzip2                     1.0.8                h7b6447c_0  
ca-certificates           2023.12.12           h06a4308_0  
catalogue                 2.0.10                   pypi_0    pypi
certifi                   2023.11.17               pypi_0    pypi
cffi                      1.16.0                   pypi_0    pypi
charset-normalizer        3.3.2                    pypi_0    pypi
click                     8.1.7                    pypi_0    pypi
cloudpathlib              0.16.0                   pypi_0    pypi
comm                      0.2.1                    pypi_0    pypi
confection                0.1.4                    pypi_0    pypi
cymem                     2.0.8                    pypi_0    pypi
debugpy                   1.8.0                    pypi_0    pypi
decorator                 5.1.1                    pypi_0    pypi
defusedxml                0.7.1                    pypi_0    pypi
diffusers                 0.26.3                   pypi_0    pypi
exceptiongroup            1.2.0                    pypi_0    pypi
executing                 2.0.1                    pypi_0    pypi
fastjsonschema            2.19.1                   pypi_0    pypi
filelock                  3.13.1                   pypi_0    pypi
fqdn                      1.5.1                    pypi_0    pypi
fsspec                    2023.12.2                pypi_0    pypi
ftfy                      6.1.3                    pypi_0    pypi
huggingface-hub           0.20.3                   pypi_0    pypi
idna                      3.6                      pypi_0    pypi
importlib-metadata        7.0.1                    pypi_0    pypi
ipykernel                 6.29.0                   pypi_0    pypi
ipython                   8.20.0                   pypi_0    pypi
ipywidgets                8.1.1                    pypi_0    pypi
isoduration               20.11.0                  pypi_0    pypi
jedi                      0.19.1                   pypi_0    pypi
jinja2                    3.1.3                    pypi_0    pypi
json5                     0.9.14                   pypi_0    pypi
jsonpointer               2.4                      pypi_0    pypi
jsonschema                4.21.1                   pypi_0    pypi
jsonschema-specifications 2023.12.1                pypi_0    pypi
jupyter-client            8.6.0                    pypi_0    pypi
jupyter-core              5.7.1                    pypi_0    pypi
jupyter-events            0.9.0                    pypi_0    pypi
jupyter-lsp               2.2.2                    pypi_0    pypi
jupyter-server            2.12.5                   pypi_0    pypi
jupyter-server-terminals  0.5.2                    pypi_0    pypi
jupyterlab                4.0.11                   pypi_0    pypi
jupyterlab-pygments       0.3.0                    pypi_0    pypi
jupyterlab-server         2.25.2                   pypi_0    pypi
jupyterlab-widgets        3.0.9                    pypi_0    pypi
langcodes                 3.3.0                    pypi_0    pypi
ld_impl_linux-64          2.38                 h1181459_1  
libffi                    3.4.4                h6a678d5_0  
libgcc-ng                 11.2.0               h1234567_1  
libgomp                   11.2.0               h1234567_1  
libstdcxx-ng              11.2.0               h1234567_1  
libuuid                   1.41.5               h5eee18b_0  
markupsafe                2.1.4                    pypi_0    pypi
matplotlib-inline         0.1.6                    pypi_0    pypi
mistune                   3.0.2                    pypi_0    pypi
mpmath                    1.3.0                    pypi_0    pypi
murmurhash                1.0.10                   pypi_0    pypi
nbclient                  0.9.0                    pypi_0    pypi
nbconvert                 7.14.2                   pypi_0    pypi
nbformat                  5.9.2                    pypi_0    pypi
ncurses                   6.4                  h6a678d5_0  
nest-asyncio              1.6.0                    pypi_0    pypi
networkx                  3.2.1                    pypi_0    pypi
notebook                  7.0.7                    pypi_0    pypi
notebook-shim             0.2.3                    pypi_0    pypi
numpy                     1.26.3                   pypi_0    pypi
nvidia-cublas-cu12        12.1.3.1                 pypi_0    pypi
nvidia-cuda-cupti-cu12    12.1.105                 pypi_0    pypi
nvidia-cuda-nvrtc-cu12    12.1.105                 pypi_0    pypi
nvidia-cuda-runtime-cu12  12.1.105                 pypi_0    pypi
nvidia-cudnn-cu12         8.9.2.26                 pypi_0    pypi
nvidia-cufft-cu12         11.0.2.54                pypi_0    pypi
nvidia-curand-cu12        10.3.2.106               pypi_0    pypi
nvidia-cusolver-cu12      11.4.5.107               pypi_0    pypi
nvidia-cusparse-cu12      12.1.0.106               pypi_0    pypi
nvidia-nccl-cu12          2.18.1                   pypi_0    pypi
nvidia-nvjitlink-cu12     12.3.101                 pypi_0    pypi
nvidia-nvtx-cu12          12.1.105                 pypi_0    pypi
openssl                   3.0.12               h7f8727e_0  
overrides                 7.7.0                    pypi_0    pypi
packaging                 23.2                     pypi_0    pypi
pandas                    2.2.0                    pypi_0    pypi
pandocfilters             1.5.1                    pypi_0    pypi
parso                     0.8.3                    pypi_0    pypi
pexpect                   4.9.0                    pypi_0    pypi
pillow                    10.2.0                   pypi_0    pypi
pip                       23.3.2                   pypi_0    pypi
platformdirs              4.1.0                    pypi_0    pypi
preshed                   3.0.9                    pypi_0    pypi
prometheus-client         0.19.0                   pypi_0    pypi
prompt-toolkit            3.0.43                   pypi_0    pypi
psutil                    5.9.8                    pypi_0    pypi
ptyprocess                0.7.0                    pypi_0    pypi
pure-eval                 0.2.2                    pypi_0    pypi
pycparser                 2.21                     pypi_0    pypi
pydantic                  2.6.1                    pypi_0    pypi
pydantic-core             2.16.2                   pypi_0    pypi
pygments                  2.17.2                   pypi_0    pypi
python                    3.10.13              h955ad1f_0  
python-dateutil           2.8.2                    pypi_0    pypi
python-json-logger        2.0.7                    pypi_0    pypi
pytz                      2024.1                   pypi_0    pypi
pyyaml                    6.0.1                    pypi_0    pypi
pyzmq                     25.1.2                   pypi_0    pypi
readline                  8.2                  h5eee18b_0  
referencing               0.33.0                   pypi_0    pypi
regex                     2023.12.25               pypi_0    pypi
requests                  2.31.0                   pypi_0    pypi
rfc3339-validator         0.1.4                    pypi_0    pypi
rfc3986-validator         0.1.1                    pypi_0    pypi
rpds-py                   0.17.1                   pypi_0    pypi
safetensors               0.4.2                    pypi_0    pypi
send2trash                1.8.2                    pypi_0    pypi
setuptools                68.2.2          py310h06a4308_0  
six                       1.16.0                   pypi_0    pypi
smart-open                6.4.0                    pypi_0    pypi
sniffio                   1.3.0                    pypi_0    pypi
soupsieve                 2.5                      pypi_0    pypi
spacy                     3.7.4                    pypi_0    pypi
spacy-legacy              3.0.12                   pypi_0    pypi
spacy-loggers             1.0.5                    pypi_0    pypi
sqlite                    3.41.2               h5eee18b_0  
srsly                     2.4.8                    pypi_0    pypi
stack-data                0.6.3                    pypi_0    pypi
sympy                     1.12                     pypi_0    pypi
terminado                 0.18.0                   pypi_0    pypi
thinc                     8.2.3                    pypi_0    pypi
tinycss2                  1.2.1                    pypi_0    pypi
tk                        8.6.12               h1ccaba5_0  
tokenizers                0.15.2                   pypi_0    pypi
tomli                     2.0.1                    pypi_0    pypi
torch                     2.1.2                    pypi_0    pypi
torchaudio                2.1.2                    pypi_0    pypi
torchvision               0.16.2                   pypi_0    pypi
tornado                   6.4                      pypi_0    pypi
tqdm                      4.66.2                   pypi_0    pypi
traitlets                 5.14.1                   pypi_0    pypi
transformers              4.37.2                   pypi_0    pypi
triton                    2.1.0                    pypi_0    pypi
typer                     0.9.0                    pypi_0    pypi
types-python-dateutil     2.8.19.20240106          pypi_0    pypi
typing-extensions         4.9.0                    pypi_0    pypi
tzdata                    2024.1                   pypi_0    pypi
uri-template              1.3.0                    pypi_0    pypi
urllib3                   2.1.0                    pypi_0    pypi
wasabi                    1.1.2                    pypi_0    pypi
wcwidth                   0.2.13                   pypi_0    pypi
weasel                    0.3.4                    pypi_0    pypi
webcolors                 1.13                     pypi_0    pypi
webencodings              0.5.1                    pypi_0    pypi
websocket-client          1.7.0                    pypi_0    pypi
wheel                     0.41.2          py310h06a4308_0  
widgetsnbextension        4.0.9                    pypi_0    pypi
xz                        5.4.5                h5eee18b_0  
zipp                      3.17.0                   pypi_0    pypi
zlib                      1.2.13               h5eee18b_0  
```