In [27]:
import pathlib
from collections import Counter


import pandas as pd
from tqdm import tqdm

In [28]:
labeled_bc_dir = pathlib.Path('/home/nika/Desktop/sip_dataset/LABELED-BCs')

In [29]:
def get_protected_dirs(labeled_bc_dir):
    for src_data_dir in labeled_bc_dir.iterdir():
        for obfs_dir in src_data_dir.iterdir():
            yield obfs_dir

In [30]:
obfs_dirs = list(get_protected_dirs(labeled_bc_dir))
obfs_dirs[:10]

[PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/BCF30-SUB2-FLA2'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/BCF30-FLA-SUB'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/FLA-BCF30'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/BCF100-SUB-FLA'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/BCF100'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/SUB-FLA'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/BCF30-FLA2-SUB2'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/BCF30-SUB'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/SUB-BCF100-FLA'),
 PosixPath('/home/nika/Desktop/sip_dataset/LABELED-BCs/simple-cov2/NONE')]

In [31]:
def count_lines(file_path):
    with open(file_path) as inp:
        return len(inp.readlines())

In [32]:
def get_label_stats(ast_file_path):
    counter = Counter()
    with open(ast_file_path) as inp:
        for line in map(str.strip, inp):
            label = line.split('\t')[-1]
            counter[label] += 1
    return counter

In [74]:
data = []
for obfs_dir in tqdm(obfs_dirs):
    ll_files = [file for file in obfs_dir.iterdir() if file.suffix == '.ll']
    for file in ll_files:
        protection = file.name.split('-')[1:][0]
        if protection not in ('CFI', 'OH', 'SC'):
            protection = 'NONE'
            
        label_stats = get_label_stats(file.with_suffix('.sip_labels'))
        data.append({
            'file': file,
            'src_dataset': file.parts[-3],
            'obfs': obfs_dir.name,
            'protection': protection,
            'num_lines': count_lines(file),
            'num_blocks': sum(label_stats.values()),
            'protection_blocks': sum([v for k, v in label_stats.items() if k != 'none'])
        })

100%|██████████| 98/98 [00:59<00:00,  1.64it/s]


In [76]:
df = pd.DataFrame(data)

In [77]:
df.head()

Unnamed: 0,file,src_dataset,obfs,protection,num_lines,num_blocks,protection_blocks
0,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,7204,217,11
1,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,3126,89,3
2,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,OH,10050,263,18
3,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,CFI,25155,804,18
4,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,2109,88,5


In [81]:
stats = df[df['obfs'] == 'NONE']\
    .groupby(['protection'])\
    .agg({
        'num_blocks': 'sum', 
        'protection_blocks': 'sum', 
        'num_lines': 'sum', 
        'file': 'count'
    })\
    .reset_index()\
    .sort_values(['protection_blocks'], ascending=False)

stats['avg_ir_lines'] = stats['num_lines'] / stats['file']

no_protetion_avg_lines = float(stats[(stats['protection'] == 'NONE')]['avg_ir_lines'])
stats['avg_ir_lines_increase'] = (stats['avg_ir_lines'] / no_protetion_avg_lines - 1.0) * 100
stats = stats.sort_values('avg_ir_lines_increase')[
    ['protection', 'num_blocks', 'protection_blocks', 'avg_ir_lines', 'avg_ir_lines_increase']
]

stats.columns = ['SIP scheme', 'Targ. Blocks', 'Protected Blocks', 'Avg # IR Lines', 'Avg % Added IR Lines']
stats

Unnamed: 0,SIP scheme,Targ. Blocks,Protected Blocks,Avg # IR Lines,Avg % Added IR Lines
1,NONE,8167,0,1075.398058,0.0
3,SC,15997,3010,2090.495146,94.392684
0,CFI,37373,3130,4351.320388,304.624163
2,OH,32476,5589,6042.291262,461.865554


In [82]:
def int_thousands(num):
    res = []
    while num > 0:
        res.append(str(num % 1000))
        num //= 1000
        if num > 0:
            res[-1] = res[-1].zfill(3)
    return ','.join(reversed(res))

In [83]:
print(stats.to_latex(
    float_format='%0.2f', bold_rows=True, label='tab:dataset_protection_stats', index=False,
    formatters=[None, int_thousands, int_thousands, None, None]
))

\begin{table}
\centering
\label{tab:dataset_protection_stats}
\begin{tabular}{lrrrr}
\toprule
SIP scheme & Targ. Blocks & Protected Blocks &  Avg \# IR Lines &  Avg \% Added IR Lines \\
\midrule
      NONE &        8,167 &                  &         1075.40 &                  0.00 \\
        SC &       15,997 &            3,010 &         2090.50 &                 94.39 \\
       CFI &       37,373 &            3,130 &         4351.32 &                304.62 \\
        OH &       32,476 &            5,589 &         6042.29 &                461.87 \\
\bottomrule
\end{tabular}
\end{table}



In [84]:
src_data_stats_df = df[['src_dataset', 'protection', 'num_blocks', 'protection_blocks']]\
    .groupby(['src_dataset', 'protection'])\
    .agg({'num_blocks': 'sum', 'protection_blocks': 'sum'})\
    .reset_index()\
    .pivot(index='src_dataset', columns='protection', values=['num_blocks', 'protection_blocks'])[
        [
            ('num_blocks', 'NONE'), ('num_blocks', 'SC'), ('num_blocks', 'OH'), ('num_blocks', 'CFI'),
#          ('protection_blocks', 'NONE'), ('protection_blocks', 'SC'), 
#          ('protection_blocks', 'OH'), ('protection_blocks', 'CFI')
        ]
    ]\
    .rename(
        columns={
            'num_blocks': 'Total Blocks', 
            'protection_blocks': 'Protected Blocks', 'src_dataset': 'Source Dataset',
            'protection': 'Protection'
        }
    )

print(src_data_stats_df.to_latex(
    float_format='%0.2f', bold_rows=True, label='tab:protection_scheme_stats', index=True,
    formatters=[int_thousands, int_thousands, int_thousands, int_thousands]
))

\begin{table}
\centering
\label{tab:protection_scheme_stats}
\begin{tabular}{lrrrr}
\toprule
{} & \multicolumn{4}{l}{Total Blocks} \\
\textbf{protection} &         NONE &        SC &        OH &       CFI \\
\textbf{src\_dataset} &              &           &           &           \\
\midrule
\textbf{mibench-cov} &    1,821,549 & 3,596,715 & 6,997,849 & 6,367,185 \\
\textbf{simple-cov } &       58,474 &   115,782 &   212,839 &   417,708 \\
\textbf{simple-cov2} &      243,570 &   479,596 &   887,538 & 1,739,939 \\
\bottomrule
\end{tabular}
\end{table}



In [115]:
df

Unnamed: 0,file,src_dataset,obfs,protection,num_lines,num_blocks,protection_blocks
0,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,7204,217,11
1,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,3126,89,3
2,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,OH,10050,263,18
3,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,CFI,25155,804,18
4,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,2109,88,5
...,...,...,...,...,...,...,...
12887,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,SC,1256,137,4
12888,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,CFI,10107,1214,15
12889,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,CFI,7854,987,11
12890,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,NONE,1644,195,0


In [135]:
stats = df[(df['protection'] == 'NONE') & (df.src_dataset == 'simple-cov')]\
.groupby(['obfs'])\
.agg({'num_blocks': 'sum', 'num_lines': 'sum', 'file': 'count'})\
.reset_index().sort_values(['obfs'], ascending=False)


stats['avg_ir_lines'] = stats['num_lines'] / stats['file']

no_obfs_avg_lines = float(stats[stats['obfs'] == 'NONE']['avg_ir_lines'])

stats['avg_ir_lines_increase'] = (stats['avg_ir_lines'] / no_obfs_avg_lines - 1.0) * 100
stats.sort_values('avg_ir_lines', inplace=True)

stats = stats[['obfs', 'num_blocks', 'avg_ir_lines', 'avg_ir_lines_increase']]
# stats = stats[stats['obfs'].map(lambda x: '-' not in x)]
stats.columns = ['Obfuscation', 'Blocks', 'Avg IR Lines / Program', 'Avg % IR Lines Incr.']

stats

Unnamed: 0,Obfuscation,Blocks,Avg IR Lines / Program,Avg % IR Lines Incr.
10,NONE,783,228.85,0.0
11,SUB,783,240.72,5.19
0,BCF30,2019,458.98,100.56
5,FLA,2415,533.38,133.07
12,SUB-BCF30,2169,541.12,136.45
8,FLA-SUB,2415,544.55,137.95
14,SUB-FLA,2415,553.35,141.8
3,BCF30-SUB,2073,641.35,180.25
1,BCF30-FLA,3938,1035.38,352.43
13,SUB-BCF30-FLA,3865,1055.12,361.06


In [133]:
print(stats.to_latex(
    float_format='%0.2f', bold_rows=True, label='tab:dataset_obfuscation_stats_simple_cov', index=False,
    formatters=[None, int_thousands, None, None]
))

\begin{table}
\centering
\label{tab:dataset_obfuscation_stats_simple_cov}
\begin{tabular}{lrrr}
\toprule
  Obfuscation & Blocks &  Avg IR Lines / Program &  Avg \% IR Lines Incr. \\
\midrule
         NONE &    783 &                  228.85 &                  0.00 \\
          SUB &    783 &                  240.72 &                  5.19 \\
        BCF30 &  2,019 &                  458.98 &                100.56 \\
          FLA &  2,415 &                  533.38 &                133.07 \\
    SUB-BCF30 &  2,169 &                  541.12 &                136.45 \\
      FLA-SUB &  2,415 &                  544.55 &                137.95 \\
      SUB-FLA &  2,415 &                  553.35 &                141.80 \\
    BCF30-SUB &  2,073 &                  641.35 &                180.25 \\
    BCF30-FLA &  3,938 &                 1035.38 &                352.43 \\
SUB-BCF30-FLA &  3,865 &                 1055.12 &                361.06 \\
BCF30-FLA-SUB &  3,887 &                 1155.60 

In [137]:
stats = df[(df['protection'] == 'NONE') & (df.src_dataset == 'simple-cov')]\
.groupby(['obfs'])\
.agg({'num_blocks': 'sum', 'num_lines': 'sum', 'file': 'count'})\
.reset_index().sort_values(['obfs'], ascending=False)


stats['avg_ir_lines'] = stats['num_lines'] / stats['file']

no_obfs_avg_lines = float(stats[stats['obfs'] == 'NONE']['avg_ir_lines'])

stats['avg_ir_lines_increase'] = (stats['avg_ir_lines'] / no_obfs_avg_lines - 1.0) * 100
stats.sort_values('avg_ir_lines', inplace=True)

stats = stats[['obfs', 'num_blocks', 'avg_ir_lines', 'avg_ir_lines_increase']]
# stats = stats[stats['obfs'].map(lambda x: '-' not in x)]
stats.columns = ['Obfuscation', 'Blocks', 'Avg IR Lines / Program', 'Avg % IR Lines Incr.']

stats

Unnamed: 0,Obfuscation,Blocks,Avg IR Lines / Program,Avg % IR Lines Incr.
29,NONE,6601,4019.91,0.0
30,SUB,6601,4924.09,22.49
18,FLA,17764,8087.7,101.19
5,BCF30,17365,8399.7,108.95
25,FLA-SUB,17764,9123.7,126.96
37,SUB-FLA,17764,9173.3,128.2
13,BCF40,20605,9889.65,146.02
33,SUB-BCF30,17335,11929.7,196.77
10,BCF30-SUB,17239,12121.57,201.54
35,SUB-BCF40,19855,13800.48,243.3


In [139]:
print(stats.to_latex(
    float_format='%0.2f', bold_rows=True, label='tab:dataset_obfuscation_stats_mibench_cov', index=False,
    formatters=[None, int_thousands, None, None], longtable=True
))

\begin{longtable}{lrrr}
\label{tab:dataset_obfuscation_stats_mibench_cov}\\
\toprule
    Obfuscation &  Blocks &  Avg IR Lines / Program &  Avg \% IR Lines Incr. \\
\midrule
\endfirsthead

\toprule
    Obfuscation &  Blocks &  Avg IR Lines / Program &  Avg \% IR Lines Incr. \\
\midrule
\endhead
\midrule
\multicolumn{4}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
           NONE &   6,601 &                 4019.91 &                  0.00 \\
            SUB &   6,601 &                 4924.09 &                 22.49 \\
            FLA &  17,764 &                 8087.70 &                101.19 \\
          BCF30 &  17,365 &                 8399.70 &                108.95 \\
        FLA-SUB &  17,764 &                 9123.70 &                126.96 \\
        SUB-FLA &  17,764 &                 9173.30 &                128.20 \\
          BCF40 &  20,605 &                 9889.65 &                146.02 \\
      SUB-BCF30 &  17,335 &                11929.7

In [141]:
stats = df[(df['protection'] == 'NONE')]\
.groupby(['obfs'])\
.agg({'num_blocks': 'sum', 'num_lines': 'sum', 'file': 'count'})\
.reset_index().sort_values(['obfs'], ascending=False)


stats['avg_ir_lines'] = stats['num_lines'] / stats['file']

no_obfs_avg_lines = float(stats[stats['obfs'] == 'NONE']['avg_ir_lines'])

stats['avg_ir_lines_increase'] = (stats['avg_ir_lines'] / no_obfs_avg_lines - 1.0) * 100
stats.sort_values('avg_ir_lines', inplace=True)

stats = stats[['obfs', 'num_blocks', 'avg_ir_lines', 'avg_ir_lines_increase']]
# stats = stats[stats['obfs'].map(lambda x: '-' not in x)]
stats.columns = ['Obfuscation', 'Blocks', 'Avg IR Lines / Program', 'Avg % IR Lines Incr.']

stats

Unnamed: 0,Obfuscation,Blocks,Avg IR Lines / Program,Avg % IR Lines Incr.
29,NONE,8167,1075.4,0.0
30,SUB,8167,1286.68,19.65
18,FLA,22594,2222.54,106.67
5,BCF30,21643,2249.99,109.22
25,FLA-SUB,22594,2459.44,128.7
37,SUB-FLA,22594,2479.41,130.56
33,SUB-BCF30,21793,3088.96,187.24
10,BCF30-SUB,21487,3208.76,198.38
13,BCF40,23122,3966.4,268.83
6,BCF30-FLA,38441,4529.66,321.21


In [143]:
print(stats.to_latex(
    float_format='%0.2f', bold_rows=True, label='tab:dataset_obfuscation_stats_full', index=False,
    formatters=[None, int_thousands, None, None], longtable=True
))

\begin{longtable}{lrrr}
\label{tab:dataset_obfuscation_stats_full}\\
\toprule
    Obfuscation &  Blocks &  Avg IR Lines / Program &  Avg \% IR Lines Incr. \\
\midrule
\endfirsthead

\toprule
    Obfuscation &  Blocks &  Avg IR Lines / Program &  Avg \% IR Lines Incr. \\
\midrule
\endhead
\midrule
\multicolumn{4}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
           NONE &   8,167 &                 1075.40 &                  0.00 \\
            SUB &   8,167 &                 1286.68 &                 19.65 \\
            FLA &  22,594 &                 2222.54 &                106.67 \\
          BCF30 &  21,643 &                 2249.99 &                109.22 \\
        FLA-SUB &  22,594 &                 2459.44 &                128.70 \\
        SUB-FLA &  22,594 &                 2479.41 &                130.56 \\
      SUB-BCF30 &  21,793 &                 3088.96 &                187.24 \\
      BCF30-SUB &  21,487 &                 3208.76 &    

In [197]:
df['program'] = df['file'].map(lambda x: x.name.split('-')[0].split('.')[0])

In [198]:
df

Unnamed: 0,file,src_dataset,obfs,protection,num_lines,num_blocks,protection_blocks,program
0,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,7204,217,11,perfect
1,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,3126,89,3,insertionsort
2,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,OH,10050,263,18,frequency
3,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,CFI,25155,804,18,perfect
4,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov2,BCF30-SUB2-FLA2,SC,2109,88,5,reverse
...,...,...,...,...,...,...,...,...
12887,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,SC,1256,137,4,writetofile
12888,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,CFI,10107,1214,15,armstrong
12889,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,CFI,7854,987,11,fib
12890,/home/nika/Desktop/sip_dataset/LABELED-BCs/sim...,simple-cov,FLA-SUB-BCF30,NONE,1644,195,0,gcdrec


In [212]:
stats = df.groupby(['src_dataset'])\
    .agg({'num_blocks': 'sum', 'program': 'nunique', 'obfs': 'nunique'})\
    .sort_values('num_blocks')[['obfs', 'program', 'num_blocks']]\
    .rename(
        index={'src_dataset': 'Source Programs'},
        columns={'obfs': 'No. Obfuscations', 'program': 'No. Programs', 'num_blocks': 'Total No. Basic Blocks'}
    )
stats

Unnamed: 0_level_0,No. Obfuscations,No. Programs,Total No. Basic Blocks
src_dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
simple-cov,16,40,804803
simple-cov2,41,40,3350643
mibench-cov,41,23,18783298


In [213]:
print(stats.to_latex(
    float_format='%0.2f', bold_rows=True, label='tab:source_program_stats', index=True,
    formatters=[None, None, int_thousands]
))

\begin{table}
\centering
\label{tab:source_program_stats}
\begin{tabular}{lrrr}
\toprule
{} &  No. Obfuscations &  No. Programs & Total No. Basic Blocks \\
\textbf{src\_dataset} &                   &               &                        \\
\midrule
\textbf{simple-cov } &                16 &            40 &                804,803 \\
\textbf{simple-cov2} &                41 &            40 &              3,350,643 \\
\textbf{mibench-cov} &                41 &            23 &             18,783,298 \\
\bottomrule
\end{tabular}
\end{table}



In [211]:
df[df.src_dataset == 'simple-cov'].obfs.unique()

array(['BCF30-FLA-SUB', 'FLA-BCF30', 'SUB-FLA', 'BCF30-SUB', 'NONE',
       'BCF30', 'FLA-BCF30-SUB', 'SUB-BCF30', 'FLA', 'SUB',
       'BCF30-SUB-FLA', 'FLA-SUB', 'BCF30-FLA', 'SUB-FLA-BCF30',
       'SUB-BCF30-FLA', 'FLA-SUB-BCF30'], dtype=object)