In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from statsmodels.stats.proportion import proportion_confint

In [None]:
# Load the classification results table handed over by the Snakemake rule.
res = pd.read_csv( snakemake.input.results )

# A call counts as correct only when the predicted lineage equals the true
# lineage AND the reported confidence is exactly 1.
res["correct"] = (res["lineage"] == res["actual"]) & (res["confidence"] == 1)

# Rescale by 1/100 so the value is a 0-1 fraction (the plots format it with
# PercentFormatter(1)); presumably the CSV stores a percentage - confirm.
res["frac_masked"] = res["frac_masked"] / 100

# Pull the number after the slash out of the "NAME(a/b)" token in the notes.
# expand=False makes str.extract return a Series, so the column assignment
# no longer depends on implicit single-column DataFrame handling. Rows whose
# notes do not match the pattern become NaN.
res["parsimony_placements"] = pd.to_numeric(
    res["classification_notes"].str.extract( r"[A-Z0-9.]+\([0-9]+\/([0-9]+)\)", expand=False )
)

# For incorrect calls, replace the confidence with its complement.
res.loc[~res["correct"], "confidence"] = 1 - res["confidence"]

# Drop every row whose file name starts with "ERR025389_T2". na=False treats a
# missing file name as "does not match", so the mask stays boolean and the
# negation cannot fail on NaN; such rows are kept.
res = res.loc[~res["file"].str.startswith( "ERR025389_T2", na=False )]
res.head()

In [None]:
# For each masking level: number of rows evaluated and number called correctly.
accuracy = (
    res.groupby( "frac_masked" )["correct"]
    .agg( observations="count", successes="sum" )
    .reset_index()
)
accuracy["accuracy"] = accuracy["successes"] / accuracy["observations"]

# Row-wise Jeffreys interval at alpha=0.05; the (low, high) tuple returned by
# proportion_confint is expanded into the two bound columns.
accuracy[["accuracy_low", "accuracy_high"]] = accuracy.apply(
    lambda row: proportion_confint( row["successes"], row["observations"], alpha=0.05, method="jeffreys" ),
    axis=1,
    result_type="expand",
)

# Complement of the masked fraction.
accuracy["genome_present"] = 1 - accuracy["frac_masked"]
accuracy.head()

In [None]:
fig, ax = plt.subplots( figsize=(5,4), dpi=200 )

# Accuracy curve drawn on top (zorder 100) of its shaded interval band (zorder 99).
ax.plot( "frac_masked", "accuracy", data=accuracy, color="black", zorder=100 )
ax.fill_between(
    accuracy["frac_masked"],
    accuracy["accuracy_low"],
    accuracy["accuracy_high"],
    color="black",
    alpha=0.2,
    linewidth=0,
    zorder=99,
)

# Both axes hold 0-1 fractions; render the tick labels as percentages.
for axis in ( ax.xaxis, ax.yaxis ):
    axis.set_major_formatter( mticker.PercentFormatter( 1 ) )

# Limits run slightly past 1 so the end of the curve is not clipped at the frame.
ax.set_xlim( 0, 1.01 )
ax.set_ylim( 0, 1.01 )

# Minor ticks every 0.05 on both axes.
ax.set_xticks( np.arange( 0, 1, 0.05 ), minor=True )
ax.set_yticks( np.arange( 0, 1, 0.05 ), minor=True )

ax.set_xlabel( "Genome masked", fontweight="bold" )
ax.set_ylabel( "Accuracy", fontweight="bold" )

ax.grid( which="both", color="#EFEFEF" )

# Write the figure to the path declared by the Snakemake rule, then display it.
fig.tight_layout()
fig.savefig( snakemake.output.accuracy_plot )
plt.show()

In [None]:
# Summary statistics of the parsimony placement counts at each masking level,
# including the 2.5%, 50% and 97.5% percentiles used by the plot below.
pp = (
    res.groupby( "frac_masked" )["parsimony_placements"]
    .describe( percentiles=[0.025, 0.5, 0.975] )
    .reset_index()
)
pp.head()

In [None]:
fig, ax = plt.subplots( dpi=200, figsize=(5,4) )

# Median placement count per masking level (line) with the 2.5%-97.5%
# percentile range shaded beneath it.
ax.plot( "frac_masked", "50%", data=pp, color="black", zorder=100 )
ax.fill_between( "frac_masked", "2.5%", "97.5%", data=pp, color="black", linewidth=0, alpha=0.2, zorder=99)

# The x axis holds a 0-1 fraction; show it as a percentage.
ax.xaxis.set_major_formatter( mticker.PercentFormatter( 1 ) )

ax.set_xlim( 0, 1.01 )
ax.set_ylim( 0, 20 )

# Minor ticks: every 0.05 on x and every 1 on y; major y ticks every 5.
ax.set_xticks( np.arange( 0, 1, 0.05 ), minor=True )
ax.set_yticks( np.arange( 0, 20, 1 ), minor=True )
ax.set_yticks( np.arange( 0, 25, 5 ), minor=False )

ax.set_xlabel( "Genome masked (%)", fontweight="bold" )
# Spelling fixed: the label previously read "Parsimoneous".
ax.set_ylabel( "Parsimonious placements", fontweight="bold" )

ax.grid( which="both", color="#EFEFEF")

# Write the figure to the path declared by the Snakemake rule, then display it.
plt.tight_layout()
plt.savefig( snakemake.output.parsimony_plot )
plt.show()

In [None]:
# Same accuracy summary as the overall table, but split by true lineage as well
# as masking level.
aps = res.groupby( ["actual", "frac_masked"] )["correct"].agg( observations="count", successes="sum" )
aps["accuracy"] = aps["successes"] / aps["observations"]

# Row-wise Jeffreys interval at alpha=0.05, expanded into the two bound columns.
aps[["accuracy_low", "accuracy_high"]] = aps.apply(
    lambda row: proportion_confint( row["successes"], row["observations"], alpha=0.05, method="jeffreys" ),
    axis=1,
    result_type="expand",
)

# Turn the (actual, frac_masked) index back into ordinary columns.
aps = aps.reset_index()
aps.head()

In [None]:
# One panel per lineage on a 3x4 grid with shared axes.
fig, axes = plt.subplots( nrows=3, ncols=4, sharex=True, sharey=True, figsize=(10,6), dpi=200 )
lineages = ["T1", "T2", "T5", "T6", "T7", "T8", "T9", "T10", "T12", "T13", "T15"]

# Eleven lineages fill eleven of the twelve panels; drop the unused bottom-right one.
axes[-1,-1].remove()

for lineage, ax in zip( lineages, axes.flat ):
    lineage_acc = aps[aps["actual"] == lineage]

    # Accuracy curve above its shaded interval band.
    ax.plot( lineage_acc["frac_masked"], lineage_acc["accuracy"], label=lineage, zorder=10, color="black" )
    ax.fill_between(
        lineage_acc["frac_masked"],
        lineage_acc["accuracy_low"],
        lineage_acc["accuracy_high"],
        color="black",
        alpha=0.2,
        linewidth=0,
        zorder=9,
    )

    ax.set_ylim( 0, 1.05 )
    ax.set_xlim( 0, 1.05 )

    # Minor ticks every 0.1, major ticks at 0, 0.5 and 1, shown as percentages.
    ax.set_xticks( np.arange( 0, 1, 0.1 ), minor=True )
    ax.set_yticks( np.arange( 0, 1, 0.1 ), minor=True )
    ax.set_xticks( [0, 0.5, 1] )
    ax.set_yticks( [0, 0.5, 1] )
    for axis in ( ax.xaxis, ax.yaxis ):
        axis.set_major_formatter( mticker.PercentFormatter( 1 ) )

    # Heavier grid lines on the major ticks, lighter ones on the minor ticks.
    for which, width in ( ("major", 1), ("minor", 0.5) ):
        ax.grid( which=which, axis="both", linewidth=width, color="#F1F1F1", zorder=1 )

    ax.set_title( lineage, fontweight="bold", loc="left", fontsize=10 )
    ax.set_xlabel( "Genome masked (%)", fontweight="bold" )
    ax.set_ylabel( "Accuracy (%)", fontweight="bold" )

    # T10 lands in axes[1,3], directly above the removed panel, so it is
    # skipped here and given its own x-axis labelling after the loop.
    if lineage != "T10":
        ax.label_outer()

# Panel above the removed one: turn its x tick labels on and clear its y label.
above_gap = axes[1,3]
above_gap.tick_params( axis="x", labelbottom=True )
above_gap.set_ylabel( "" )
above_gap.set_xticks( [0, 0.5, 1] )

# Write the figure to the path declared by the Snakemake rule, then display it.
plt.tight_layout()
plt.savefig( snakemake.output.accuracy_lineage_plot )
plt.show()