#  Tensorflow Timeline Analysis on Model Zoo Benchmark among different data types

This Jupyter notebook helps you evaluate the performance benefits of different data types at the level of Tensorflow operations, using several models from Intel Model Zoo. The notebook shows a bar chart like the picture below for the Tensorflow operation level performance comparison. The red horizontal line represents the performance of Tensorflow operations with one data type as the performance baseline, the blue bars represent the speedup of Tensorflow operations that use another data type with oneDNN, and the orange bars represent the speedup of Tensorflow operations that use another data type without oneDNN. Users should be able to see a good speedup for those operations accelerated by Intel DL Boost instructions. 
> NOTE: Users first need to generate Tensorflow Timeline json files with another Jupyter notebook, such as benchmark_data_types_perf_comparison,
  before proceeding with this Jupyter notebook.

<img src="images/compared_tf_op_duration_ratio_bar_types.png" width="700">

# Get Platform Information 

In [None]:
# Suppress all warning messages so they do not clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Create the project's PlatformUtils helper and dump the platform information.
# NOTE(review): what exactly is reported is defined in profiling.profile_utils.
from profiling.profile_utils import PlatformUtils
plat_utils = PlatformUtils()
plat_utils.dump_platform_info()

# Section 1 : Prerequisites

In [None]:
# Install the cxxfilt package into the current environment (shell escape).
!pip install cxxfilt

# Render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
import pandas as pd

# Raise pandas' display limits so wide/long tables are shown with less
# truncation in the notebook output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1500),
):
    pd.set_option(option_name, option_value)

## List out the Timeline folders

First, list out all Timeline folders from previous runs.

In [None]:
import os

# Collect the sub-directories of the current working directory whose name
# contains "Timeline", in sorted order.
keyword = "Timeline"
base_dir = os.path.abspath(".")
result = sorted(
    entry
    for entry in os.listdir(".")
    if os.path.isdir(os.path.join(base_dir, entry)) and keyword in entry
)

# Print every candidate folder next to the index used to select it.
for position, folder in enumerate(result):
    print(" %d : %s " % (position, folder))

## Select a Timeline folder from previous runs
#### ACTION: Please select one Timeline folder and change FdIndex accordingly

In [None]:
# Take the folder index from the "FD_INDEX" environment variable when it holds
# a non-empty value; otherwise ask the user for it.
import os
env_fd_index = os.environ.get('FD_INDEX', '')
if env_fd_index == '':
    ## USER INPUT
    FdIndex = int(input('Input a index number of a folder: '))
else:
    FdIndex = int(env_fd_index)

List out all Timeline json files inside Timeline folder.

In [None]:
import os

# Resolve the Timeline folder selected above and gather every file in it whose
# name ends with ".json".
TimelineFd = result[FdIndex]
print(TimelineFd)
datafiles = [os.path.join(TimelineFd, x) for x in os.listdir(TimelineFd) if x.endswith('.json')]
print(datafiles)
# Compare the count by value: "is" tests object identity, which is not a
# reliable integer comparison and triggers a SyntaxWarning on Python 3.8+.
if len(datafiles) == 0:
    print("ERROR! No json file in the selected folder. Please select other folder.")
elif len(datafiles) == 1:
    print("WARNING! There is only 1 json file in the selected folder. Please select other folder to proceed Section 1.2.")

<a id='section_2'></a>
# Section 2: Performance analysis between two different data types

## Section 2.1 : Analyze TF Timeline results

### Step 1: Select  two TF timeline files with different data types for analysis

#### List out all timeline files in the selected folder

In [None]:
# The comparison below needs two timeline files, so flag a folder that holds
# only one. "==" is used because "is" compares identity, not value.
if len(datafiles) == 1:
    print("ERROR! There is only 1 json file in the selected folder.")
    print("Please select other Timeline folder from beginnning to proceed Section 1.2.")

# Print each timeline file next to the index used to select it below.
for i, datafile in enumerate(datafiles):
    print(" %d : %s " % (i, datafile))


#### ACTION: Please select one timeline file as a performance baseline and the other as a comparison target
Please put the related index for your selected timeline file.  
In general, fp32 data type should be the performance baseline.

In [None]:
import os

# Performance baseline: use the "BASELINE_INDEX" environment variable when it
# holds a non-empty value, otherwise prompt the user.
env_baseline_index = os.environ.get('BASELINE_INDEX', '')
if env_baseline_index == '':
    ## USER INPUT
    Baseline_Index = int(input('Input a index number of a Performance Baseline: '))
else:
    Baseline_Index = int(env_baseline_index)

# Comparison target: same rule with the "COMPARISON_INDEX" environment variable.
env_comparison_index = os.environ.get('COMPARISON_INDEX', '')
if env_comparison_index == '':
    ## USER INPUT
    Comparison_Index = int(input('Input a index number of a Performance Comparison: '))
else:
    Comparison_Index = int(env_comparison_index)


#### List out two selected timeline files

In [None]:
# Index 0 holds the baseline timeline file, index 1 the comparison target.
selected_datafiles = [datafiles[Baseline_Index], datafiles[Comparison_Index]]
print(selected_datafiles)

#### Find the related oneDNN JITDUMP folder and Tensorflow Log file with oneDNN Verbose log

In [None]:
from profiling.profile_utils import PerfPresenter
perfp=PerfPresenter()

# Derive one tag per selected timeline file from the two file names.
# NOTE(review): presumably the tag is the data-type part in which the names
# differ; confirm against PerfPresenter.get_diff_from_csv_filenames.
tag0, tag1 = perfp.get_diff_from_csv_filenames(selected_datafiles[0],selected_datafiles[1])
tags = []
jitdumps_fd_path = []
logs_path = []

for tag in [tag0, tag1]:
    # TEMP FIX: map the short tags to the names used in the 'JITDUMP_<tag>'
    # folder names looked up below.
    if tag == 'bf16':
        tag = 'bfloat16'
    if tag == 'f32':
        tag = 'fp32'
    JITDUMP_FD_PATH= TimelineFd + os.sep +'JITDUMP_'+ tag
    if os.path.isdir(os.path.join(os.path.abspath("."), JITDUMP_FD_PATH)):
        logs = [JITDUMP_FD_PATH +os.sep+ x for x in os.listdir(JITDUMP_FD_PATH) if x.endswith('.log.old')]
        jitdumps_fd_path.append(JITDUMP_FD_PATH)
        if logs:
            # The first '.log.old' file found is used as this run's log.
            LOG_PATH = logs[0]
            logs_path.append(LOG_PATH)
        else:
            # A JIT dump folder without a log used to raise IndexError here;
            # keep the folder and report the missing log instead.
            print("WARNING! No '.log.old' file found in %s" % JITDUMP_FD_PATH)
    tags.append(tag)
print(tags)
print(jitdumps_fd_path)
print(logs_path)

### Step 2: Parsing timeline results into CSV files

In [None]:
%matplotlib agg
from profiling.profile_utils import TFTimelinePresenter
csvfiles=[]

tfp = TFTimelinePresenter(True)
for fn in selected_datafiles:
    if fn.find('/'):
        fn_nofd=fn.split('/')[1]
    else:
        fn_nofd=fn
    tfile_name= fn_nofd.split('.')[0]
    tfile_prefix = fn_nofd.split('_')[0]
    tfile_postfix = fn_nofd.strip(tfile_prefix)[1:] 
    csvpath = TimelineFd +os.sep+tfile_name+'.csv'
    print(csvpath)
    csvfiles.append(csvpath)
    timeline_pd = tfp.postprocess_timeline(tfp.read_timeline(fn))
    timeline_pd = timeline_pd[timeline_pd['ph'] == 'X']
    tfp.get_tf_ops_time(timeline_pd,fn,tfile_prefix)

### Step 3: Pre-processing for the two CSV files

In [None]:
import os
import pandas as pd

# Load every per-run CSV into a DataFrame, preserving the order of csvfiles.
csvarray = []
for csv_path in csvfiles:
    print("read into pandas :", csv_path)
    csvarray.append(pd.read_csv(csv_path))

# a: table of the first CSV file, b: table of the second CSV file.
a, b = csvarray[0], csvarray[1]

In [None]:
# Find tags among CSV files.
# The two tags come from PerfPresenter.get_diff_from_csv_filenames applied to
# the two CSV paths; this assignment replaces any earlier value of `tags`.
# NOTE(review): presumably each tag is the data-type part of the file name — confirm.
tags=[]
from profiling.profile_utils import PerfPresenter
perfp=PerfPresenter()
tag0, tag1 = perfp.get_diff_from_csv_filenames(csvfiles[0],csvfiles[1])
tags = [tag0, tag1]
print('tags : ',tags)

### Step 4: Merge two CSV files and calculate the speedup accordingly
Users can check column "speedup" for the speedup from bfloat16 or int8.  
If the operation uses oneDNN for acceleration, the "mkl_op" column should be marked as "True". 


In [None]:
import os
import pandas as pd

# Make sure the output folder for the merged/diff CSV files exists.
fdir = 'merged'
if not os.path.exists(fdir):
    os.mkdir(fdir)

# fpaths[0] is the merged table; fpaths[1] and fpaths[2] are the per-tag
# 'diff_<tag>.csv' files.
fpaths = [
    os.path.join(fdir, 'merged.csv'),
    os.path.join(fdir, 'diff_' + tags[0] + '.csv'),
    os.path.join(fdir, 'diff_' + tags[1] + '.csv'),
]
merged = tfp.merge_two_csv_files_v2(fpaths, a, b, tags)
merged

#### The unique Tensorflow operations from the first csv/Timeline file

In [None]:
# Switch matplotlib back to inline rendering.
%matplotlib inline
print("Operations are only in", tags[0], " run")
# fpaths[1] is the 'diff_<tags[0]>.csv' path passed to the merge step.
extra1 = pd.read_csv(fpaths[1])
extra1

#### The unique Tensorflow operations from the second csv/Timeline file

In [None]:
print("Operations are only in", tags[1], " run")
# fpaths[2] is the 'diff_<tags[1]>.csv' path passed to the merge step.
extra2 = pd.read_csv(fpaths[2])
extra2

### Step 5: Draw a bar chart for elapsed time of TF ops among two different data types
The first diagram compares the elapsed time of operations among two different data types.  
The second diagram shows the speedup of TF operations from comparison target.  
The blue bar of second diagram is accelerated by oneDNN operation.

In [None]:
%matplotlib inline
print(fpaths[0])
# Both charts are drawn from the merged CSV file (fpaths[0]).
tfp.plot_compare_bar_charts(fpaths[0], tags=tags)
# NOTE(review): the ratio chart receives fixed labels instead of the data-type
# tags — presumably legend text; confirm in TFTimelinePresenter.
tfp.plot_compare_ratio_bar_charts(fpaths[0], tags=['','oneDNN ops'])

### Step 6: Draw pie charts for elapsed time of TF ops among different data types
Users should be able to identify top hotspots from below pie charts among different data types.  

> NOTE: Users could also compare elapsed time of TF ops among any two different TF timeline files.

We will have following pie charts in sequence:
1. the pie chart for elapsed time of TF ops from stock TF or the first csv/Timeline file
2. the pie chart for elapsed time of unique TF ops from stock TF or the first csv/Timeline file
3. the pie chart for elapsed time of TF ops from Intel TF or the second csv/Timeline file
4. the pie chart for elapsed time of unique TF ops from Intel TF or the second csv/Timeline file
5. the pie chart for elapsed time of common TF ops among stock & Intel TF or two csv/Timeline files


#### The pie chart for elapsed time of TF ops from the first csv/Timeline file
Understand which TF operations take the most time.

In [None]:
# Pie chart from the first run's own CSV file, passing the first tag.
tfp.plot_pie_chart(csvfiles[0], tags[0])

#### The pie chart for elapsed time of unique TF operations from the first csv/Timeline file
Understand whether there is any unique TF operation.

In [None]:
# Pie chart from the 'diff_<tags[0]>.csv' file (fpaths[1]), passing the first tag.
tfp.plot_pie_chart(fpaths[1], tags[0])

#### The pie chart for elapsed time of TF ops from the second csv/Timeline file
Understand which TF operations take the most time.

In [None]:
# Pie chart from the second run's own CSV file, passing the second tag.
tfp.plot_pie_chart(csvfiles[1], tags[1])

#### The pie chart for elapsed time of unique TF operations from the second csv/Timeline file
Understand whether there is any unique TF operation.

In [None]:
# Pie chart from the 'diff_<tags[1]>.csv' file (fpaths[2]), passing the second tag.
tfp.plot_pie_chart(fpaths[2], tags[1])

#### The pie chart for elapsed time of common TF ops among two csv/Timeline files
Understand the differences in top hotspots between the two csv/Timeline files.

In [None]:
# Pie charts drawn from the merged CSV file (fpaths[0]) with both tags.
tfp.plot_compare_pie_charts(fpaths[0], tags=tags)

<a id='section_2_2'></a>
## Section 2.2: Analyze oneDNN debug logs and JIT dumps

>NOTE: Section 2.2 is only relevant if user had DNNL_VERBOSE or DNNL_JIT_DUMP enabled

### Step 1: Parse related oneDNN Verbose logs

>NOTE: Step 1-3 is only relevant if user had DNNL_VERBOSE enabled.

In [None]:
from profiling.profile_utils import oneDNNUtils, oneDNNLog
onednn = oneDNNUtils()

# Load the first collected log (logs_path[0]) and keep its exec_data.
# Requires that the log-discovery cell found at least two log files.
log1 = oneDNNLog()
log1.load_log(logs_path[0])
exec_data1 = log1.exec_data

# Load the second collected log (logs_path[1]) and keep its exec_data.
log2 = oneDNNLog()
log2.load_log(logs_path[1])
exec_data2 = log2.exec_data
print(logs_path)

### Step 2: Primitive type speedup from the comparison target (bfloat16 or int8)
The diagram below shows the performance speedup from the comparison target data type, for example from VNNI or BF16.

In [None]:
 # Note the argument order: log2 (from logs_path[1]) is passed before log1 (from logs_path[0]).
 onednn.stats_comp('type', 'time',log2, log1, tags=tags)

### Step 3:  Time breakdown for JIT kernel type

oneDNN uses just-in-time compilation (JIT) to generate optimal code for some functions based on input parameters and instruction set supported by the system.   
Therefore, users can see different JIT kernel type among different CPU and GPU architectures.  
For example, users can see avx_core_vnni JIT kernel if the workload uses VNNI instruction on Cascade Lake platform.  
Users can also see different OCL kernels among different Intel GPU generations.  
Moreover, users can identify the top hotspots of JIT kernel executions with this time breakdown.  


 Time breakdown of Baseline Data Type for JIT kernel type 

In [None]:
print("Time Breakdown of %s data type" %(tags[0]))
# Break the first log's exec_data down by "jit" and then by "type", both over "time".
onednn.breakdown(exec_data1,"jit","time")
onednn.breakdown(exec_data1,"type","time")

 Time breakdown of Comparison Data Type for JIT kernel type 

In [None]:
print("Time Breakdown of %s data type" %(tags[1]))
# Break the second log's exec_data down by "jit" and then by "type", both over "time".
onednn.breakdown(exec_data2,"jit","time")
onednn.breakdown(exec_data2,"type","time")

### Step 4: Inspect JIT Kernel 

>NOTE: Step 4 is only relevant if user had DNNL_JIT_DUMP enabled

In this section, we analyze the JIT dump files of the comparison data type.    
Users should be able to see the exact CPU instruction usage like VNNI or BF16 from those JIT Dump files.

In [None]:
# Announce which data type and JIT dump folder (second entry) the next cells inspect.
print('will inspect JIT codes from %s data type in %s '%(tags[1], jitdumps_fd_path[1]))

##### List out all JIT Dump Files with index number

In [None]:
import os

# Collect the entries of the second JIT dump folder whose name contains
# ".bin", in sorted order.
keyword = ".bin"
result = sorted(name for name in os.listdir(jitdumps_fd_path[1]) if keyword in name)

# Print every dump file next to the index used to select it.
for position, dump_name in enumerate(result):
    print(" %d : %s " % (position, dump_name))

# Number of listed files; the selection cell only prompts when it is positive.
jit_fd_index = len(result)

##### ACTION :  Pick a JIT Dump file by putting its index value below

In [None]:
import os

# Default to the first JIT dump file. A non-empty "JIT_FD_INDEX" environment
# variable overrides the default; otherwise the user is prompted, but only
# when at least one dump file was listed.
## USER INPUT
FdIndex = 0
env_jit_fd_index = os.environ.get('JIT_FD_INDEX', '')
if env_jit_fd_index != '':
    FdIndex = int(env_jit_fd_index)
elif jit_fd_index > 0:
    ## USER INPUT
    FdIndex = int(input('Input a index number of a JIT folder: '))

##### export JIT Dump file to environment variable JITFILE  and also related ISA keyword to environment variable DNNL_ISA_KEYWORD

In [None]:
# Export the selected JIT dump file path as the JITFILE environment variable,
# which the objdump shell command reads.
if FdIndex < len(result):
    logfile = result[FdIndex]
    os.environ["JITFILE"] = jitdumps_fd_path[1]+os.sep+logfile
    print(os.environ["JITFILE"])
else:
    print("WARNING! Index %d does not match any listed JIT dump file; JITFILE is not set." % FdIndex)

# grep -E pattern to look for in the disassembly, per data-type tag. Both the
# short and the long tag spellings are accepted.
isa_keywords = {
    'f32': "zmm",
    'fp32': "zmm",
    'int8': "vpdpbusd",
    'bf16': 'vdpbf16ps|vcvtne2ps2bf16',
    'bfloat16': 'vdpbf16ps|vcvtne2ps2bf16',
}
isa_keyword = isa_keywords.get(tags[1])
if isa_keyword is not None:
    os.environ["DNNL_ISA_KEYWORD"] = isa_keyword
    print(os.environ["DNNL_ISA_KEYWORD"])
else:
    # An unmatched tag used to leave the variable unset and the print raised
    # KeyError; report it instead.
    print("WARNING! No ISA keyword is known for data type '%s'; DNNL_ISA_KEYWORD is not set." % tags[1])

#### Disassemble the JIT Dump file
> NOTE: zmm register is introduced by AVX512 ISA.  
Users should see usage of **zmm** register in AVX512 JIT dump files.  

> NOTE: vpdpbusd is introduced by AVX512_VNNI ISA.  
Users should see usage of **vpdpbusd** in AVX512_VNNI JIT dump files. 

> NOTE: **vdpbf16ps**, **vcvtne2ps2bf16**, and **vcvtneps2bf16** are introduced by AVX512_BF16 ISA.  
Users should see usage of vdpbf16ps, vcvtne2ps2bf16 or vcvtneps2bf16 in AVX512_BF16 JIT dump files. 

> NOTE: To disassemble the vdpbf16ps, vcvtne2ps2bf16, and vcvtneps2bf16 instructions, users must use objdump **v2.34** or above.

In [None]:
# Disassemble the raw JIT dump file as x86-64 code and keep only the lines that match the ISA keyword pattern.
!objdump -D -b binary -mi386:x86-64 $JITFILE | grep -E $DNNL_ISA_KEYWORD

### (Optional) Step 5: Move all result files into the selected Timeline folder
By running the code below, all result files will be moved to the selected Timeline folder.

In [None]:
from profiling.profile_utils import CommonUtils
utils = CommonUtils()
import os
import shutil
import datetime

# Move the png and csv result files found from the current working directory
# into the selected Timeline folder.
current_path = os.getcwd()
for pattern in ("*.png", "*.csv"):
    matched_names, matched_paths = utils.found_files_in_folder(pattern, current_path)
    for matched_path in matched_paths:
        shutil.move(matched_path, TimelineFd)

# Move the pretrained model, merged and logs folders (when present) into the
# Timeline folder, adding a timestamp suffix to each folder name.
timeinfo = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")
for fd_name in ('pretrained', 'merged', 'logs'):
    if os.path.isdir(fd_name):
        shutil.move(fd_name, TimelineFd + os.sep + fd_name + '_' + timeinfo)
