In [1]:
import pandas as pd
from importlib.resources import files
from pathlib import Path
import tempfile
import subprocess

In [None]:
# Load the tab-separated VCF supplied by Snakemake and index it by the fixed
# per-site VCF columns, so that only the per-sample columns remain as data.
# NOTE(review): skiprows=13 drops the first 13 lines of the file — presumably
# the "##" meta-information header; this assumes the header is always exactly
# 13 lines long. TODO confirm against the rule that produces this file.
vcf_site_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
vcf = (
    pd.read_csv( snakemake.input.vcf, sep="\t", skiprows=13 )
    .set_index( vcf_site_columns )
)
vcf.head()

In [None]:
# Read the candidate table and derive two things from it:
#   * "lineage": the text after the last underscore of each "id" value
#   * samples:   the "sequence_id" values, used later to select VCF columns
cand = pd.read_csv( snakemake.input.candidates )


def _id_suffix( identifier ):
    """Return the part of `identifier` that follows its final underscore."""
    return identifier.rsplit( "_", 1 )[-1]


cand["lineage"] = cand["id"].map( _id_suffix )
samples = cand["sequence_id"].tolist()
cand.head()

In [None]:
# Run UShER on randomly subsampled VCFs and collect its clade assignments.
#
# For every retained fraction and every trial, a random subset of the VCF rows
# (restricted to the candidate sample columns) is written to a scratch VCF,
# UShER is run against the bundled protobuf tree, and the resulting
# `clades.txt` is read back and tagged with the fraction missing and the trial.
pb = files( "vibecheck" ) / "resources/o1_cholera.no_missing.pb"

# snakemake.params.frequencies is divided by 100 to obtain the fraction of VCF
# rows to keep in each subsample.
fractions = [i/100 for i in snakemake.params.frequencies]

results = list()

for frac in fractions:
    for trial in range( snakemake.params.trials ):
        # A fresh, uniquely named scratch directory per run: a fixed name in
        # the shared temp dir could still hold a `clades.txt` from an earlier
        # (or concurrent) run, which would be read back as if it were new.
        # The directory and everything in it is removed when the block exits.
        with tempfile.TemporaryDirectory( prefix=f"usher-{trial}-{frac}-" ) as scratch:
            scratch_dir = Path( scratch )
            temp_vcf = scratch_dir / "subsampled.vcf"
            temp_dir = scratch_dir / "usher"
            temp_dir.mkdir()

            # NOTE: sampling is unseeded, so every execution draws different rows.
            vcf.sample( frac=frac, replace=False )[samples].to_csv( temp_vcf, sep="\t" )

            # Argument list instead of a shell string, so paths containing
            # spaces or shell metacharacters are passed through unchanged.
            usher_cmd = [
                "usher", "-n", "-D",
                "-i", str( pb ),
                "-v", str( temp_vcf ),
                "-T", str( snakemake.threads ),
                "-d", str( temp_dir ),
            ]
            usher_run = subprocess.run( usher_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True )
            if usher_run.returncode != 0:
                # Surface UShER's own output rather than failing later on a
                # missing clades.txt with no indication of the cause.
                raise RuntimeError(
                    f"usher exited with status {usher_run.returncode} "
                    f"(frac={frac}, trial={trial}):\n{usher_run.stdout}"
                )

            # Must be read before the scratch directory is cleaned up.
            result = pd.read_csv( temp_dir / "clades.txt", sep="\t", header=None, names=["sample", "result"] )

        result["fraction_missing"] =  1-frac
        result["trial"] = trial
        results.append( result )

    print( f"Finished {frac}" )

In [None]:
# Stack the per-run tables into one frame and score each placement:
#   * "lineage": text of the UShER result before the first "*|"
#   * "actual":  fourth "|"-delimited field of the sample name
#                (presumably the known lineage — verify against the naming scheme)
#   * "correct": whether the two agree
def _assigned_lineage( placement ):
    """Return the portion of an UShER result string preceding the first "*|"."""
    return placement.split( "*|" )[0]


def _labelled_lineage( sample_name ):
    """Return the fourth "|"-separated field of a sample name."""
    return sample_name.split( "|" )[3]


usher_results = pd.concat( results, ignore_index=True )
usher_results["lineage"] = usher_results["result"].map( _assigned_lineage )
usher_results["actual"] = usher_results["sample"].map( _labelled_lineage )
usher_results["correct"] = usher_results["lineage"].eq( usher_results["actual"] )
usher_results.head()

In [None]:
# Write the scored placements to the Snakemake-declared output as CSV
# (default settings, so the row index is written as the first column).
results_path = snakemake.output.results
usher_results.to_csv( results_path )