In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype as Category
import matplotlib.pyplot as plt
from glob import glob
import seaborn as sns
import numpy as np
from iterextras import par_for
import subprocess as sp
import os
import itertools
from tqdm.auto import tqdm
import rs_utils
from scipy.stats import gmean
import matplotlib as mpl
import json
import statsmodels.formula.api as sm
from matplotlib import rc

# Apply seaborn's default theme first, then override fonts so figures are
# typeset by LaTeX in Linux Libertine.
sns.set()

rc('font', family='serif', serif=['Linux Libertine O'])
rc(
    'text',
    **{
        'usetex': True,
        'latex.preamble': r'\usepackage{libertine}\usepackage[libertine]{newtxmath}',
    },
)

In [None]:
# One JSON file per crate under ../data; the crate name is the file's base
# name with its extension stripped.
PATHS = glob('../data/*.json')
CRATES = [os.path.splitext(name)[0] for name in map(os.path.basename, PATHS)]
# Shared categorical dtype (alphabetically sorted categories) for the `crate` column.
crate_cat = Category(categories=sorted(CRATES))

def parse_data(path):
    """Load one crate's JSON results into a DataFrame tagged with the crate name.

    Parameters
    ----------
    path : str
        Path to a JSON file. The crate name is taken from the file's base
        name with the extension removed.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_json`` with an added ``crate`` column
        cast to the module-level ``crate_cat`` categorical dtype. A zero-byte
        file yields an empty DataFrame.
    """
    print(path)
    if os.stat(path).st_size == 0:
        # Return an empty frame, not a list: the results of this function are
        # passed to pd.concat, which only accepts Series/DataFrame objects and
        # raises TypeError on a plain list.
        return pd.DataFrame()

    crate = os.path.splitext(os.path.basename(path))[0]
    df = pd.read_json(path)
    df['crate'] = crate
    # Use the shared categorical dtype so every per-crate frame agrees on the
    # column's dtype when they are concatenated.
    df['crate'] = df['crate'].astype(crate_cat)
    return df


# Parse every file through par_for and stack the per-crate frames; reset_index()
# (without drop) then replaces the stacked index with a fresh 0..n-1 range and
# keeps the old values in an `index` column.
df = pd.concat(par_for(parse_data, PATHS))
df = df.reset_index()

In [None]:
# Slice size as a ratio: relevant tokens / total tokens, relevant lines / total lines.
df['tok_frac'] = df['num_relevant_tokens'] / df['num_tokens']
df['line_frac'] = df['num_relevant_lines'] / df['num_lines']

# df['spread_tok_frac'] = df.token_spread / df.num_tokens
# df['spread_line_frac'] = df.line_spread / df.num_lines

# Function-size features derived from num_lines:
#   cutoff        - mean line count, rounded to an int
#   big_func      - True when the function has at least `cutoff` lines
#   func_bin      - line count rounded down to a multiple of 50
#   func_quartile - quartile interval of the line count
cutoff = int(np.round(df['num_lines'].mean()))
df['big_func'] = df['num_lines'].ge(cutoff)
df['func_bin'] = df['num_lines'].floordiv(50).mul(50)
df['func_quartile'] = pd.qcut(df['num_lines'], q=[0, 0.25, 0.5, 0.75, 1.])

# Methodology (Section 4.1)


## Dataset size

In [None]:
# Report the number of rows in df; the focus-region count is that number
# integer-divided by 3 (presumably three rows per focus region — TODO confirm).
print(f'{len(df)} samples, {len(df)//3} focus regions')


# Results (Section 4.2)

## Distribution of slice sizes by direction

In [None]:
# Quartiles (25th / 50th / 75th percentile) of line_frac, one row per direction.
(
    df.groupby('direction')['line_frac']
    .describe()
    .loc[:, ['25%', '50%', '75%']]
)

## Figure 5-top-left

In [None]:
# Box plot of line_frac per direction, written to linefrac.pdf.
fig, ax = plt.subplots(figsize=(2.75, 3.5))
sns.boxplot(data=df, x='direction', y='line_frac', width=0.5, ax=ax)
ax.set(xlabel='Direction', ylabel='Slice size')
fig.savefig('linefrac.pdf', bbox_inches='tight')

## Distribution of slice sizes by function size and direction

In [None]:
# Quartiles of line_frac for each (big_func, direction) combination.
(
    df.groupby(['big_func', 'direction'])['line_frac']
    .describe()
    .loc[:, ['25%', '50%', '75%']]
)

## Figure 5-top-right

In [None]:
# Box plot of line_frac split by the big_func flag, coloured by direction,
# written to linefrac-by-size.pdf.
fig, ax = plt.subplots(figsize=(5, 3.5))
sns.boxplot(
    data=df,
    x='big_func',
    y='line_frac',
    hue='direction',
    width=0.5,
    ax=ax,
)
ax.set_xlabel(f'$\\geq {cutoff}$ lines of code?')
ax.set_ylabel('Slice size')
# Anchor the legend just outside the right edge of the axes, without a frame.
ax.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0., frameon=False)
fig.tight_layout()
fig.savefig('linefrac-by-size.pdf', bbox_inches='tight')

## Figure 5-bottom

In [None]:
# Box plot of tok_frac per crate, coloured by direction; crates are laid out
# left to right by ascending median tok_frac, outlier markers are hidden
# (fliersize=0) and the legend is removed.
fig, ax = plt.subplots(figsize=(8, 3.5))
order = (
    df.groupby('crate')['tok_frac']
    .median()
    .sort_values()
    .index
    .tolist()
)
sns.boxplot(
    data=df,
    x='crate',
    y='tok_frac',
    hue='direction',
    order=order,
    fliersize=0,
    width=0.6,
    ax=ax,
)
ax.get_legend().remove()
ax.set(xlabel='Crate', ylabel='Slice size')
fig.tight_layout()
fig.savefig('linefrac-by-size-crate.pdf', bbox_inches='tight')