In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype as Category
import matplotlib.pyplot as plt
from glob import glob
import seaborn as sns
import numpy as np
from iterextras import par_for
import subprocess as sp
import os
import itertools
from tqdm.auto import tqdm
import rs_utils
from scipy.stats import gmean
import matplotlib as mpl
import json
import statsmodels.formula.api as sm
from matplotlib import rc
import subprocess as sp

# Global plot styling: seaborn "whitegrid" theme with the tab10 palette.
# Text is rendered through LaTeX (text.usetex) using the Libertine font
# packages named in the preamble, so a LaTeX installation providing
# `libertine` and `newtxmath` is needed to draw the figures.
LATEX_PREAMBLE = r'\usepackage{libertine}\usepackage[libertine]{newtxmath}'
PLOT_RC = {
    'font.family': 'serif',
    'font.serif': ['Linux Libertine O'],
    'text.usetex': True,
    'text.latex.preamble': LATEX_PREAMBLE,
}
sns.set(style='whitegrid', palette='tab10', rc=PLOT_RC)

In [None]:
# One JSON file per crate lives under ../data; the crate name is the file's
# base name with the extension stripped.
PATHS = glob('../data/*.json')
CRATES = [
    os.path.splitext(file_name)[0]
    for file_name in map(os.path.basename, PATHS)
]
# Shared categorical dtype (categories in sorted order) for the 'crate' column.
crate_cat = Category(categories=sorted(CRATES))

def parse_data(path):
    """Load one crate's JSON file into a DataFrame tagged with its crate name.

    Parameters
    ----------
    path : str
        Path to a JSON file. Its base name without the extension is used as
        the crate name.

    Returns
    -------
    pandas.DataFrame or None
        The frame produced by ``pd.read_json(path)`` with an extra ``crate``
        column of dtype ``crate_cat``. ``None`` is returned for a zero-byte
        file: ``pd.concat`` silently drops ``None`` entries, whereas the list
        ``[]`` returned previously made ``pd.concat`` raise ``TypeError``.
    """
    print(path)
    if os.stat(path).st_size == 0:
        # Nothing to parse; signal "no data" in a form pd.concat accepts.
        return None

    crate = os.path.splitext(os.path.basename(path))[0]
    df = pd.read_json(path)
    # Fill the column with the crate name, then convert it to the shared
    # categorical dtype so frames from different crates concatenate cleanly.
    df['crate'] = crate
    df['crate'] = df['crate'].astype(crate_cat)
    return df


# Parse every crate file with par_for and stack the per-crate frames.
# Results that are not DataFrames (parse_data yields no frame for a
# zero-byte file) are skipped, because pd.concat raises on entries such as [].
# reset_index() without drop=True keeps each row's previous index as an
# 'index' column.
frames = [part for part in par_for(parse_data, PATHS) if isinstance(part, pd.DataFrame)]
df = pd.concat(frames).reset_index()

In [None]:
# Slice size relative to the enclosing function, measured in tokens and lines.
df['tok_frac'] = df['num_relevant_tokens'] / df['num_tokens']
df['line_frac'] = df['num_relevant_lines'] / df['num_lines']

# Functions at or above the (rounded) mean line count are flagged as "big".
cutoff = int(np.round(df['num_lines'].mean()))
df['big_func'] = df['num_lines'].ge(cutoff)
# Lower edge of the 50-line-wide bucket each function falls into.
df['func_bin'] = df['num_lines'].floordiv(50).mul(50)
# Quartile interval of the function's line count.
df['func_quartile'] = pd.qcut(df['num_lines'], q=[0, 0.25, 0.5, 0.75, 1.])

# Methodology (Section 4.1)




## Table 3

In [None]:
# Crates in the evaluation dataset. Each entry is
#   (crate name, sub-directory or None, one-line purpose, repository URL).
# The name is also the checkout directory under ../data/repos, and the
# sub-directory (when given) is the path inside the repo that holds `src`.
CRATE_INFO = [
    ("Rocket",     "core/lib", "Web backend framework",    "https://github.com/SergioBenitez/Rocket"),
    ("image",      None,       "Image processing library", "https://github.com/image-rs/image"),
    ("rayon",      None,       "Data parallelism library", "https://github.com/rayon-rs/rayon"),
    ("rg3d",       None,       "3D game engine",           "https://github.com/mrDIMAS/rg3d"),
    ("nalgebra",   None,       "Numerics library",         "https://github.com/dimforge/nalgebra"),
    ("rustls",     "rustls",   "TLS implementation",       "https://github.com/ctz/rustls"),
    ("sccache",    None,       "Distributed build cache",  "https://github.com/mozilla/sccache"),
    ("hyper",      None,       "HTTP server",              "https://github.com/hyperium/hyper"),
    ("rav1e",      None,       "Video encoder",            "https://github.com/xiph/rav1e"),
    ("RustPython", "vm",       "Python interpreter",       "https://github.com/RustPython/RustPython"),
]

In [None]:
# Per-crate statistics are computed from the backward-direction rows only.
bwd = df.loc[df['direction'] == 'Backward']

# Number of rows per (crate, function). Combinations with a zero count are
# dropped — presumably these come from grouping on the categorical 'crate'
# column, which can emit unobserved category combinations; TODO confirm.
counts = (
    bwd.groupby(['crate', 'function_path'])
    .size()
    .rename('count')
)
counts = counts.loc[counts > 0].reset_index()

by_crate = counts.groupby('crate')
num_funcs = by_crate.size().rename('num_funcs')
num_slices = by_crate['count'].sum().rename('num_slices')
avg_slices_per_func = by_crate['count'].mean().rename('avg_slices_per_func')

# One row per crate, ordered by total slice count (ascending).
crate_stats = pd.concat(
    [num_slices, avg_slices_per_func, num_funcs],
    axis=1,
).sort_values('num_slices')
crate_stats

In [None]:
# Executable used to count lines of code.
CLOC = 'cloc'

# For each crate in CRATE_INFO, record its metadata, the commit currently
# checked out under ../data/repos, and the number of Rust code lines that
# cloc reports for the crate's `src` directory.
loc_records = []
for (crate, path, purpose, url) in CRATE_INFO:
    crate_dir = f'../data/repos/{crate}'
    if path:
        crate_dir += f'/{path}'
    crate_dir += '/src'

    # Commands are passed as argument lists (no shell), so a directory name
    # containing spaces or shell metacharacters cannot break or alter them.
    cloc_out = sp.check_output([CLOC, crate_dir, '--json']).strip()
    commit = (
        sp.check_output(['git', 'rev-parse', 'HEAD'], cwd=crate_dir)
        .strip()
        .decode('utf-8')
    )
    cloc_report = json.loads(cloc_out)
    loc_records.append({
        "crate": crate,
        'subdir': path,
        'purpose': purpose,
        "url": url,
        "commit": commit,
        # Raises KeyError if cloc found no Rust sources in crate_dir.
        "loc": cloc_report["Rust"]["code"],
    })
crate_loc = pd.DataFrame(loc_records)

In [None]:
# Emit the body of Table 3 as LaTeX: one row per crate (ascending by slice
# count) followed by a totals row.
crate_final = crate_stats.join(crate_loc.set_index("crate")).sort_values('num_slices')
for crate, row in crate_final.iterrows():
    # The subdir is None for crates analysed from the repository root; any
    # non-string value (None or a missing value) is rendered as an empty cell.
    subdir = row.subdir if isinstance(row.subdir, str) else ''
    cells = [
        # Backslashes are escaped explicitly: '\h' is not a valid escape
        # sequence, so an unescaped '\href' / '\hline' triggers a warning.
        f'\\href{{{row.url}}}{{{crate}}}',
        subdir,
        f'{row.purpose}',
        f'{row["loc"]:,}',
        f'{row.num_slices:,}',
        f'{row.num_funcs:,}',
        f'{row.avg_slices_per_func:.1f}',
    ]
    print(' & '.join(cells) + ' \\\\ \\hline')
print('\\multicolumn{1}{l}{} & \\multicolumn{1}{l}{} & \\multicolumn{1}{r|}{\\textbf{Total:}} &')
print(f'{crate_final["loc"].sum():,} & {crate_final.num_slices.sum():,} & {crate_final.num_funcs.sum():,}')

## Dataset size

In [None]:
# NOTE(review): the division by 3 presumably reflects one sample per slicing
# direction for every focus region — confirm against the data generator.
n_samples = len(df)
print(f'{n_samples} samples, {n_samples//3} focus regions')


# Results (Section 4.2)

## Distribution of slice sizes by direction

In [None]:
# Quartiles of the line-based slice fraction for each slicing direction.
direction_summary = df.groupby('direction')['line_frac'].describe()
direction_summary[['25%', '50%', '75%']]

## Figure 5-top-left

In [None]:
# Height shared by the Figure 5 panels.
HEIGHT = 2.5

# Box plot of the line-based slice fraction per direction.
fig, ax = plt.subplots(figsize=(2.75, HEIGHT))
sns.boxplot(data=df, x='direction', y='line_frac', width=0.5, ax=ax)
ax.set(xlabel='Direction', ylabel='Slice size')
fig.savefig('linefrac.pdf', bbox_inches='tight')

## Distribution of slice sizes by function size and direction

In [None]:
# Quartiles of the line-based slice fraction, split by function size class
# (big_func) and slicing direction.
size_summary = df.groupby(['big_func', 'direction'])['line_frac'].describe()
size_summary[['25%', '50%', '75%']]

## Figure 5-top-right

In [None]:
# Box plot of the line-based slice fraction for small vs. big functions,
# with one box per direction.
fig, ax = plt.subplots(figsize=(5, HEIGHT))
sns.boxplot(
    data=df,
    x='big_func',
    y='line_frac',
    hue='direction',
    width=0.5,
    ax=ax,
)
ax.set(xlabel=f'$\\geq {cutoff}$ lines of code?', ylabel='Slice size')
# Anchor the legend just outside the right edge of the axes, without a frame.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., frameon=False)
fig.tight_layout()
fig.savefig('linefrac-by-size.pdf', bbox_inches='tight')

## Figure 5-bottom

In [None]:
# Per-crate box plot, crates ordered left to right by median token fraction.
# NOTE(review): this panel plots tok_frac while the other panels and the
# output file name refer to line_frac — confirm this is intended.
fig, ax = plt.subplots(figsize=(8, HEIGHT))
median_by_crate = df.groupby('crate')['tok_frac'].median()
order = median_by_crate.sort_values().index.tolist()
sns.boxplot(
    data=df,
    x='crate',
    y='tok_frac',
    hue='direction',
    fliersize=0,  # outlier markers are drawn at size zero
    order=order,
    width=0.6,
    ax=ax,
)
ax.get_legend().remove()
ax.set(xlabel='Crate', ylabel='Slice size')
fig.tight_layout()
fig.savefig('linefrac-by-size-crate.pdf', bbox_inches='tight')