In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype as Category
import matplotlib.pyplot as plt
from glob import glob
import seaborn as sns
import numpy as np
from iterextras import par_for
import subprocess as sp
import os
import itertools
from tqdm.auto import tqdm
import rs_utils
from scipy.stats import gmean
import matplotlib as mpl
import json
import statsmodels.formula.api as sm
from matplotlib import rc

# Apply seaborn's default theme to every figure drawn below.
sns.set()

In [None]:
# One JSON results file per crate; the crate name is the file's base name
# with the extension removed.
PATHS = glob('../data/*.json')
CRATES = [
    os.path.splitext(os.path.basename(json_path))[0]
    for json_path in PATHS
]

# Categorical dtype for each column that is cast after parsing, so that every
# per-crate frame carries the same set of categories.
CATEGORIES = dict(
    mutability_mode=Category(["DistinguishMut", "IgnoreMut"]),
    context_mode=Category(["Recurse", "SigOnly"]),
    pointer_mode=Category(["Conservative", "Precise"]),
    crate=Category(sorted(CRATES)),
)

def parse_data(path):
    """Load one crate's results file into a DataFrame with categorical columns.

    The crate name is taken from the file's base name (extension removed) and
    stored in a ``crate`` column. Every column named in ``CATEGORIES`` is then
    cast to its categorical dtype.

    Parameters
    ----------
    path : str
        Path of a results file readable by ``rs_utils.parse_data``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed rows, or ``None`` when the file is empty. ``None`` is used
        rather than an empty list because ``pd.concat`` silently drops ``None``
        entries but raises ``TypeError`` when handed a list.

    Raises
    ------
    KeyError
        If a column named in ``CATEGORIES`` is missing from the parsed data.
    """
    print(path)
    if os.stat(path).st_size == 0:
        return None

    crate = os.path.splitext(os.path.basename(path))[0]
    df = pd.DataFrame(rs_utils.parse_data(path))
    df['crate'] = crate
    # One astype call casts all categorical columns at once.
    return df.astype(CATEGORIES)


# Run parse_data over every results file via par_for and stack the per-crate
# frames into one table; the intermediate list of frames is not kept around.
df = pd.concat(par_for(parse_data, PATHS)).reset_index()

# Derived columns: whether the relative instruction count is positive, and
# the natural log of the relative fraction.
df['ins_rel_nonzero'] = df['instructions_relative'].gt(0)
df['ins_rel_frac_log'] = np.log(df['instructions_relative_frac'])

In [None]:
def cond(df, m=None, c=None, p=None):
    """Select the rows of ``df`` that match the requested analysis modes.

    ``m``, ``c`` and ``p`` are compared against the ``mutability_mode``,
    ``context_mode`` and ``pointer_mode`` columns respectively. A value of
    ``None`` leaves that column unfiltered; with all three ``None`` the input
    frame itself is returned.
    """
    requested = (
        ('mutability_mode', m),
        ('context_mode', c),
        ('pointer_mode', p),
    )
    for column, wanted in requested:
        if wanted is None:
            continue
        df = df[getattr(df, column) == wanted]
    return df

# Modular: signature-only context, mutability-aware, precise pointers.
modular_cond = cond(df, m='DistinguishMut', c='SigOnly', p='Precise')
modular_metric = modular_cond['instructions_relative_frac']

# Mut-blind: the modular setting with mutability ignored.
mutblind_cond = cond(df, m='IgnoreMut', c='SigOnly', p='Precise')
mutblind_metric = mutblind_cond['instructions_relative_base_frac']

# Ref-blind: the modular setting with conservative pointer handling.
refblind_cond = cond(df, m='DistinguishMut', c='SigOnly', p='Conservative')
refblind_metric = refblind_cond['instructions_relative_base_frac']

# Whole-program: the modular setting with recursive context.
whole_cond = cond(df, m='DistinguishMut', c='Recurse', p='Precise')

# Dataset (Section 5.1)

## Crate information (Table 1)

In [None]:
# One entry per evaluated crate: (name, subdirectory, purpose, repository URL).
# The subdirectory is appended to the crate's checkout path when lines of code
# are counted; None means the crate's sources sit directly under the checkout.
CRATE_INFO = [
    ('Rocket', 'core/lib', 'Web backend framework', 'https://github.com/SergioBenitez/Rocket'),
    ('image', None, 'Image processing library', 'https://github.com/image-rs/image'),
    ('rayon', None, 'Data parallelism library', 'https://github.com/rayon-rs/rayon'),
    ('rg3d', None, '3D game engine', 'https://github.com/mrDIMAS/rg3d'),    
    ('nalgebra', None, 'Numerics library', 'https://github.com/dimforge/nalgebra'),
    ('rustls', 'rustls', 'TLS implementation', 'https://github.com/ctz/rustls'),
    ('sccache', None, 'Distributed build cache', 'https://github.com/mozilla/sccache'),
    ('hyper', None, 'HTTP server', 'https://github.com/hyperium/hyper'),    
    ('rav1e', None, 'Video encoder', 'https://github.com/xiph/rav1e'),
    ('RustPython', 'vm', 'Python interpreter', 'https://github.com/RustPython/RustPython'),
]

In [None]:
# Keep the first row of every (crate, function) group so that per-function
# quantities are counted once per function rather than once per slice.
modular_cond_funcs = (
    modular_cond
    .groupby(['crate', 'function_path'])
    .apply(lambda rows: rows.iloc[0])
    .reset_index(drop=True)
)

# Per-crate totals: number of rows (slices), mean instruction count per
# function, and number of distinct functions.
num_slices = modular_cond.groupby(['crate']).size().rename('num_slices')
avg_ins_per_func = (
    modular_cond_funcs.groupby('crate')['num_instructions']
    .mean()
    .rename('avg_ins_per_func')
)
num_funcs = (
    modular_cond.groupby(['crate'])
    .apply(lambda rows: len(rows.function_path.unique()))
    .rename('num_funcs')
)

# Rows per (crate, function); combinations with a size of zero are dropped
# before averaging within each crate.
q = modular_cond.groupby(['crate', 'function_path']).size()
avg_slices_per_func = q[q > 0].groupby('crate').mean().rename('avg_slices_per_func')

crate_stats = pd.concat(
    [num_slices, avg_ins_per_func, num_funcs, avg_slices_per_func],
    axis=1,
).sort_values('num_slices')
crate_stats

In [None]:
CLOC = 'cloc'
crate_loc = []
for (crate, path, purpose, url) in CRATE_INFO:
    # Source tree: ../data/repos/<crate>[/<subdir>]/src
    crate_dir = f'../data/repos/{crate}' + (f'/{path}' if path else '') + '/src'

    # Line counts as JSON from cloc, then the checked-out commit reported by
    # git when run inside the source directory.
    cloc_str = sp.check_output(f'{CLOC} {crate_dir} --json', shell=True).strip()
    commit_bytes = sp.check_output('git rev-parse HEAD', shell=True, cwd=crate_dir)
    commit = commit_bytes.strip().decode('utf-8')
    cloc = json.loads(cloc_str)

    crate_loc.append(dict(
        crate=crate,
        subdir=path,
        purpose=purpose,
        url=url,
        commit=commit,
        loc=cloc["Rust"]["code"],
    ))
crate_loc = pd.DataFrame(crate_loc)

In [None]:
# Combine the per-crate statistics with the repository metadata (joined on the
# crate name) and order the crates by their number of slices.
crate_final = crate_stats.join(crate_loc.set_index("crate")).sort_values('num_slices')

# Emit one LaTeX table row per crate. Backslashes are written explicitly
# ('\\href', '\\hline') because '\h' is not a valid Python escape sequence,
# and the row is split across two adjacent literals instead of relying on a
# backslash-newline inside the string.
for crate, row in crate_final.iterrows():
    print(
        f'\\href{{{row.url}}}{{{crate}}} & {row.subdir or ""} & {row.purpose} & {row["loc"]:,} & '
        f'{row.num_slices:,} & {row.num_funcs:,} & {row.avg_ins_per_func:.1f} \\\\ \\hline'
    )

In [None]:
# Totals row of the crate table: summed lines of code, slices and functions.
print(' & '.join(
    f'{crate_final[column].sum():,}'
    for column in ('loc', 'num_slices', 'num_funcs')
))

In [None]:
# Commit hash recorded for each crate's checkout.
crate_final.reset_index().loc[:, ['crate', 'commit']]

## Execution time (last paragraph of 5.1)

In [None]:
# Take the duration from the first row of every (context mode, function)
# group, then report the per-mode median scaled by 1e3
# (presumably seconds -> milliseconds; confirm the unit of `duration`).
durations = (
    cond(df, m='DistinguishMut', p='Precise')
    .groupby(['context_mode', 'function_path'])
    .apply(lambda rows: rows.iloc[0].duration)
)
durations.groupby('context_mode').median() * 1e3


# Quantitative analysis (Section 5.2)

In [None]:
def plot_ins(data, xscale='linear', yscale='linear', nz=False, ax=None, quantile=None, bins=None, **kwargs):
    """Draw a histogram of a dependency-set-size metric.

    Parameters
    ----------
    data : pandas.Series
        Values to plot.
    xscale, yscale : str
        Matplotlib axis scales. Any x scale whose name contains ``'log'``
        (``'log'``, ``'symlog'``) gets logarithmically spaced default bins
        starting at 1e-4; otherwise 30 evenly spaced edges from 0 to the
        maximum are used.
    nz : bool
        When true, only strictly positive values are plotted.
    ax : matplotlib.axes.Axes, optional
        Forwarded to ``sns.histplot``.
    quantile : float, optional
        When given, a red vertical line marks this quantile of the plotted
        values.
    bins : array-like, optional
        Explicit bin edges; overrides the defaults described above.
    **kwargs
        Accepted for call-site convenience but not used.

    Returns
    -------
    matplotlib.axes.Axes
        The axes returned by ``sns.histplot``.
    """
    if nz:
        data = data[data > 0]

    if bins is None:
        upper = data.max()
        if 'log' in xscale:
            bins = np.logspace(np.log10(0.0001), np.log10(upper))
            if not nz:
                # Extra leading edge so that exact zeros fall into a bin.
                bins = np.concatenate(([0.], bins))
        else:
            bins = np.linspace(0, upper, 30)

    ax = sns.histplot(data, bins=bins, ax=ax)
    ax.set_yscale(yscale)

    if xscale == 'symlog':
        # Linear region up to the smallest positive value, so that zero can be
        # shown on an otherwise logarithmic axis.
        nz_min = data[data > 0].min()
        ax.set_xscale(xscale, linthresh=nz_min)
        ax.set_xlim(-nz_min/2, data.max())
    else:
        ax.set_xscale(xscale)

    # Raw string: the label contains a literal backslash ('\%').
    ax.set_xlabel(r'\% increase in dependency set size' + (', log scale' if 'log' in xscale else ''))
    ax.set_ylabel('Count' + (', log scale' if yscale == 'log' else ''))

    if quantile is not None:
        ax.axvline(data.quantile(quantile), color='r')

    return ax

## Figure 2

In [None]:
## Note: install pdflatex and uncomment these lines if you want the graph styling to exactly match the paper version
#
# rc('font',**{'family':'serif','serif':['Linux Libertine O']})
# rc('text', **{'usetex': True, 'latex.preamble': r'\usepackage{libertine}\usepackage[libertine]{newtxmath}'})

fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(5.5, 3.))

plot_ins(modular_metric, ax=ax1, xscale='symlog')
ax1.set_title('\\textsc{y-linear}')
# Label y ticks as multiples of 10^5 computed from each tick's own value, so
# the labels stay correct whatever tick positions matplotlib chooses.
ax1.get_yaxis().set_major_formatter(mpl.ticker.FuncFormatter(
    lambda y, pos: '0' if y == 0 else f'${y / 1e5:g} \\cdot 10^5$'))

plot_ins(modular_metric, xscale='symlog', yscale='log', ax=ax2)
ax2.set_title('\\textsc{y-log}')
# 1e2, 1e4, 1e6 (the same values as the literals 10e1, 10e3, 10e5).
ax2.set_yticks([1e2, 1e4, 1e6])

for ax in [ax1, ax2]:
    # 0, 1e-2, 1e0, 1e2 (the same values as 0, 10e-3, 10e-1, 10e1).
    ax.set_xticks([0, 1e-2, 1e0, 1e2])

# A single shared x label replaces the per-axes labels set by plot_ins.
ax1.set_xlabel('')
ax2.set_xlabel('')
fig.supxlabel(r'\% difference in dependency set size, log scale (with zero)', y=0.1)

fig.tight_layout()
fig.savefig('eval-recurse-dist.pdf', bbox_inches='tight')

## Figure 3

In [None]:
fig, [ax1, ax2, ax3] = plt.subplots(1, 3, figsize=(5.5, 3.), sharey=True)

# Every panel shows only the non-zero values on a log-scaled x axis.
kwargs = {'nz': True, 'xscale': 'log'}
panels = [
    (ax1, modular_metric, '\\textsc{Modular} - \\textsc{Whole-program}', {}),
    (ax2, mutblind_metric, '\\textsc{Mut-blind} - \\textsc{Modular}', {'y': 1.125}),
    (ax3, refblind_metric, '\\textsc{Ref-blind} - \\textsc{Modular}', {}),
]
for panel_ax, metric, title, title_kwargs in panels:
    plot_ins(metric, ax=panel_ax, **kwargs)
    panel_ax.set_title(title, **title_kwargs)

for ax in [ax1, ax2, ax3]:
    ax.set_xticks([10e-3, 10e-1, 10e1])

# Thousands separators on the shared y axis.
ax1.get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, pos: format(int(value), ','))
)

# Only the middle panel keeps the x label set by plot_ins.
ax1.set_xlabel('')
ax3.set_xlabel('')

fig.tight_layout()

fig.savefig('eval-all-dist.pdf', bbox_inches='tight')

## Statistics used within 5.2 text

In [None]:
# Ordinary least squares on the signature-only rows: number of relevant
# instructions explained by mutability mode, pointer mode and their interaction.
model = sm.ols(
    formula='num_relevant_instructions ~ mutability_mode * pointer_mode',
    data=df.loc[df['context_mode'] == 'SigOnly'],
)
results = model.fit()
results.summary()

In [None]:
def stats(series):
    """Summarise a metric as 'share non-zero, share zero, median of non-zero'.

    The two shares are formatted with four decimals and the median of the
    strictly positive values with two.
    """
    positive = series[series > 0]
    nonzero_share = len(positive) / len(series)
    zero_share = 1. - nonzero_share
    return f'{nonzero_share:.4f}, {zero_share:.4f}, {positive.median():.2f}'

# Same summary for each of the three comparisons.
for label, metric in (
    ('Modular:', modular_metric),
    ('MutBlind:', mutblind_metric),
    ('RefBlind:', refblind_metric),
):
    print(label, stats(metric))

## Figure 4

In [None]:
# One histogram per crate (case-insensitive alphabetical order, five per row)
# of the non-zero Mut-blind metric on a log-scaled x axis.
g = sns.FacetGrid(data=mutblind_cond, col='crate', col_order=sorted(CRATES, key=lambda s: s.lower()), 
                  col_wrap=5, sharex=False, height=2.3, aspect=0.9)
g.map_dataframe(lambda **kwargs: plot_ins(kwargs['data'].instructions_relative_base_frac, nz=True, xscale='log'))
g.set_titles('\\textsc{{{col_name}}}')
g.fig.supylabel('Count')
# Raw string: the label contains a literal backslash ('\%').
g.fig.supxlabel(r'\% increase in dependency set size, log scale', y=0.05)
for ax in g.axes.flat:
    # Figure-level labels replace the per-axes labels set by plot_ins.
    ax.set_xlabel('')
    ax.set_ylabel('')
    # 1e-2, 1e0, 1e2 (the same values as the literals 10e-3, 10e-1, 10e1).
    ax.set_xticks([1e-2, 1e0, 1e2])

g.fig.tight_layout()
g.fig.savefig('eval-crates.pdf', bbox_inches='tight')



# Threats to validity (Section 5.4)


## 5.4.1 statistics

In [None]:
# For every crate: its total number of slices (from the crate table) next to
# the number of Mut-blind rows with a positive difference.
total_vs_nz = mutblind_cond.groupby('crate').apply(
    lambda rows: pd.DataFrame([dict(
        total=crate_final.num_slices[rows.crate.iloc[0]],
        nz=len(rows[rows.instructions_relative_base > 0]),
    )])
)
# Regress the non-zero count on the total count.
result = sm.ols('nz ~ total', data=total_vs_nz).fit()
result.summary()

## 5.4.2 statistics

In [None]:
# Fraction of modular rows whose `reached_library` flag is set.
len(modular_cond.loc[modular_cond['reached_library']]) / len(modular_cond)

In [None]:
# Share of rows with a non-zero difference, split by the `reached_library` flag.
modular_cond.groupby('reached_library').apply(
    lambda rows: len(rows[rows.ins_rel_nonzero]) / len(rows)
)