In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from statsmodels.stats.proportion import proportion_confint

In [None]:
# Load the results table handed over by Snakemake and derive the analysis columns
# in a single chain, so re-running this cell always starts from the file on disk.
res = (
    pd.read_csv( snakemake.input.results )
    .assign(
        # A call is "correct" only if the assigned lineage equals the actual one
        # AND the reported confidence is exactly 1.
        correct=lambda df: df["lineage"].eq( df["actual"] ) & df["confidence"].eq( 1 ),
        # Rescale by 1/100 (presumably percent -> fraction; confirm against the input file).
        frac_masked=lambda df: df["frac_masked"] / 100,
        # Capture the number after the slash in notes shaped like "NAME(a/b)" and make it numeric;
        # rows whose notes do not match the pattern become NaN.
        parsimony_placements=lambda df: pd.to_numeric(
            df["classification_notes"].str.extract( r"[A-Z0-9.]+\([0-9]+\/([0-9]+)\)", expand=False )
        ),
        # For rows that are not correct, the stored confidence is replaced by its complement.
        confidence=lambda df: df["confidence"].where( df["correct"], 1 - df["confidence"] ),
    )
    # Exclude every row whose file name starts with this prefix (reason not visible in this notebook).
    .loc[lambda df: ~df["file"].str.startswith( "ERR025389_T2" )]
)
res.head()

In [None]:
# Summarise correctness per masking level: how many rows were observed at each
# frac_masked value and how many of them were correct.
accuracy = (
    res.groupby( "frac_masked" )["correct"]
    .agg( ["count", "sum"] )
    .rename( columns={"count": "observations", "sum": "successes"} )
    .reset_index()
)
accuracy["accuracy"] = accuracy["successes"] / accuracy["observations"]

# 95% Jeffreys interval for each proportion. proportion_confint accepts array-like
# counts, so all rows are handled in one vectorised call instead of a row-wise apply.
ci_low, ci_high = proportion_confint(
    accuracy["successes"], accuracy["observations"], alpha=0.05, method="jeffreys"
)
# np.asarray makes the assignment positional regardless of whether statsmodels
# hands back ndarrays or pandas objects.
accuracy["accuracy_low"] = np.asarray( ci_low )
accuracy["accuracy_high"] = np.asarray( ci_high )

# Complement of the masked fraction.
accuracy["genome_present"] = 1 - accuracy["frac_masked"]
accuracy.head()

In [None]:
# Accuracy as a function of the masked genome fraction, with the confidence band
# drawn as a translucent ribbon underneath the line.
fig, ax = plt.subplots( dpi=200, figsize=(5,4) )

ax.plot( "frac_masked", "accuracy", data=accuracy, color="black", zorder=100 )
ax.fill_between(
    "frac_masked", "accuracy_low", "accuracy_high",
    data=accuracy, color="black", linewidth=0, alpha=0.2, zorder=99,
)

# Both axes hold fractions in [0, 1]; render them as percentages.
# A formatter is bound to one axis, so each axis gets its own instance.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter( mticker.PercentFormatter( 1 ) )

# Upper limit sits slightly above 1.
upper_limit = 1.01
ax.set_xlim( 0, upper_limit )
ax.set_ylim( 0, upper_limit )

# Minor ticks every 0.05 (i.e. 5 percentage points) on both axes.
minor_ticks = np.arange( 0, 1, 0.05 )
for set_ticks in (ax.set_xticks, ax.set_yticks):
    set_ticks( minor_ticks, minor=True )

ax.set_xlabel( "Genome masked", fontweight="bold" )
ax.set_ylabel( "Accuracy", fontweight="bold" )

# Light grid on major and minor ticks.
ax.grid( which="both", color="#EFEFEF")

fig.tight_layout()
fig.savefig( snakemake.output.accuracy_plot )
plt.show()

In [None]:
# Distribution of parsimony placement counts at each masking level:
# the default describe() statistics plus the 2.5th, 50th and 97.5th percentiles.
pp = (
    res.groupby( "frac_masked" )["parsimony_placements"]
    .describe( percentiles=[0.025, 0.5, 0.975] )
    .reset_index()
)
pp.head()