# LLM Profiling

This notebook launches the large language models contained in `src/models` and uses NVIDIA Nsight Compute (`ncu`) to profile their kernels on an **NVIDIA H100**.

The goal is to perform principal component analysis (PCA) to examine new large language model kernels in comparison to existing GPU benchmark suites such as Rodinia.

In [2]:
from glob import glob
import os
import subprocess

import pandas as pd
import numpy as np
from time import time

import matplotlib.pyplot as plt

In [3]:
!mkdir -p ncu-reports

# Configuration for generating an ncu report for each model.

# Value substituted into the `--nvtx-include "..."` argument of the ncu command.
NVTX_RANGE = "generation/"
# Directory (relative to this notebook) that is joined with each MODELS entry.
BASE_PATH = "../src/models/"
# Script filenames to profile; each is joined onto BASE_PATH to form the
# path handed to `python` at the end of the ncu command.
MODELS = [
    "phi2.py",
    "phi3.5.py",
    "orca-mini-7b.py",
    "jamba1.5.mini.py",
    "flux.py",
    "llama70B.py",
]
# Extra flags inserted right after `ncu` in the command line.
# NOTE(review): presumably chosen to shorten profiling runs (per the name) — confirm.
SPEEDUP_PARAMS = "--replay-mode application --warp-sampling-max-passes 1"
# NOTE(review): not referenced in the code visible here; presumably a pause
# between profiling runs — confirm against later cells.
SLEEP_TIME = 10 * 60 # 10 minutes

def clean_up_model_name(model_name: str) -> str:
    """Turn a model script filename into a dot-free name.

    Only a trailing ``.py`` extension is removed; every remaining ``.`` is
    then dropped, so ``"phi3.5.py"`` becomes ``"phi35"``.

    Args:
        model_name: Script filename such as ``"jamba1.5.mini.py"``.

    Returns:
        The filename without its ``.py`` extension and without any dots.
    """
    extension = ".py"
    # Strip the extension only at the end: a blanket replace(".py", "") would
    # also eat ".py" inside the stem (e.g. "my.pytorch.py" -> "mytorch").
    if model_name.endswith(extension):
        model_name = model_name[: -len(extension)]
    return model_name.replace(".", "")

# Maps each model filename to the elapsed seconds measured around its loop body.
diff = {}

# Build (and currently only print) the ncu command line for each model script.
# NOTE(review): the command is printed, not executed, so the timings stored in
# `diff` cover just the print call, and the `break` below stops after the first
# entry of MODELS. This looks like a dry-run/debug state — confirm before
# relying on `diff`.
for model in MODELS:
    print(f"Profiling {model}")

    # Dot-free name used as the `-o` argument; full script path passed to python.
    model_name = clean_up_model_name(model)
    model_path = os.path.join(BASE_PATH, model)

    start_time = time()
    
    # NOTE(review): `-o {model_name}` is a bare name with no directory, so it
    # presumably resolves against the working directory rather than the
    # `ncu-reports` folder created at the top of this cell — confirm.
    print(f'ncu {SPEEDUP_PARAMS} --sampling-interval auto --sampling-max-passes 5 --section SpeedOfLight --app-replay-buffer memory --apply-rules no --metrics sm__warps_active.avg.per_cycle_active,sm__warps_active.avg.pct_of_peak_sustained_active,sm__throughput.avg.pct_of_peak_sustained_elapsed,sm__maximum_warps_per_active_cycle_pct,sm__maximum_warps_avg_per_active_cycle,sm__cycles_active.avg,lts__throughput.avg.pct_of_peak_sustained_elapsed,launch__waves_per_multiprocessor,launch__thread_count,launch__shared_mem_per_block_static,launch__shared_mem_per_block_dynamic,launch__shared_mem_per_block_driver,launch__shared_mem_per_block,launch__shared_mem_config_size,launch__registers_per_thread,launch__occupancy_per_shared_mem_size,launch__occupancy_per_register_count,launch__occupancy_per_block_size,launch__occupancy_limit_warps,launch__occupancy_limit_shared_mem,launch__occupancy_limit_registers,launch__occupancy_limit_blocks,launch__grid_size,launch__func_cache_config,launch__block_size,l1tex__throughput.avg.pct_of_peak_sustained_active,gpu__time_duration.sum,gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed,gpc__cycles_elapsed.max,gpc__cycles_elapsed.avg.per_second,breakdown:sm__throughput.avg.pct_of_peak_sustained_elapsed,breakdown:gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed,launch__occupancy_per_cluster_size,launch__occupancy_cluster_pct,launch__occupancy_cluster_gpu_pct,launch__cluster_size,launch__cluster_scheduling_policy,launch__cluster_max_potential_size,launch__cluster_max_active,gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed,dram__cycles_elapsed.avg.per_second --target-processes all --nvtx --nvtx-include "{NVTX_RANGE}" -o {model_name} -f python {model_path}')

    end_time = time()

    # Wall-clock seconds between the two time() calls, keyed by the script filename.
    diff[model] = end_time - start_time
    
    print(f"Done profiling {model}")
    # Stop after the first model in MODELS.
    break

Profiling phi2.py
ncu --replay-mode application --warp-sampling-max-passes 1 --sampling-interval auto --sampling-max-passes 5 --section SpeedOfLight --app-replay-buffer memory --apply-rules no --metrics sm__warps_active.avg.per_cycle_active,sm__warps_active.avg.pct_of_peak_sustained_active,sm__throughput.avg.pct_of_peak_sustained_elapsed,sm__maximum_warps_per_active_cycle_pct,sm__maximum_warps_avg_per_active_cycle,sm__cycles_active.avg,lts__throughput.avg.pct_of_peak_sustained_elapsed,launch__waves_per_multiprocessor,launch__thread_count,launch__shared_mem_per_block_static,launch__shared_mem_per_block_dynamic,launch__shared_mem_per_block_driver,launch__shared_mem_per_block,launch__shared_mem_config_size,launch__registers_per_thread,launch__occupancy_per_shared_mem_size,launch__occupancy_per_register_count,launch__occupancy_per_block_size,launch__occupancy_limit_warps,launch__occupancy_limit_shared_mem,launch__occupancy_limit_registers,launch__occupancy_limit_blocks,launch__grid_size,la