# Replication of dnadiff results

This notebook contains all the code and command lines needed to replicate the values reported by `dnadiff`.

In [1]:
# Set Up
import os
from Bio import SeqIO
import pandas as pd
from pathlib import Path

First, we need to run four commands to generate all the files needed for the calculations.

This will include:
- `nucmer` to generate alignments with `--maxmatch` parameter
- `delta-filter` to generate M-to-M alignments by passing the `-m` parameter
- `show-coords` to generate `.coords` file needed to calculate `AlignedBases` and `AverageIdentity`
- `show-diff` to generate `.rdiff` files needed for `AlignedBases` calculation

In [2]:
def run_command_lines(input_genomes):
    """Run the MUMmer command lines needed to replicate dnadiff values.

    Writes `output.delta`, `output.mdelta`, `output.mcoords` and
    `output.rdiff` into an `output` directory located next to the input
    directory, using the same tool settings as dnadiff.

    :param input_genomes: Path to a directory holding the genome FASTA
        files. Entries are sorted by path; the first is passed to nucmer
        as the first sequence file and the second as the second one.
    :raises ValueError: if the directory holds fewer than two usable files
    :raises subprocess.CalledProcessError: if any tool exits with a
        non-zero status
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import subprocess

    input_dir = Path(input_genomes)
    output_dir = input_dir.parent / "output"
    # nucmer does not create the directory part of its `-p` prefix.
    output_dir.mkdir(parents=True, exist_ok=True)

    fasta = sorted(fname for fname in input_dir.iterdir() if '.DS_Store' not in str(fname))
    if len(fasta) < 2:
        raise ValueError(
            f"Expected at least two genome files in {input_dir}, found {len(fasta)}"
        )

    prefix = output_dir / "output"
    delta = output_dir / "output.delta"
    mdelta = output_dir / "output.mdelta"
    mcoords = output_dir / "output.mcoords"
    rdiff = output_dir / "output.rdiff"

    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact; check=True stops on the first failing tool
    # instead of leaving stale or partial files for the later steps.

    # Running nucmer (writes output.delta)
    subprocess.run(
        ["nucmer", "--maxmatch", "-p", str(prefix), str(fasta[0]), str(fasta[1])],
        check=True,
    )

    # Running delta-filter (M-to-M alignments)
    with open(mdelta, "w") as handle:
        subprocess.run(["delta-filter", "-m", str(delta)], stdout=handle, check=True)

    # Running show-coords
    with open(mcoords, "w") as handle:
        subprocess.run(["show-coords", "-rclTH", str(mdelta)], stdout=handle, check=True)

    # Running show-diff
    with open(rdiff, "w") as handle:
        subprocess.run(["show-diff", "-rH", str(mdelta)], stdout=handle, check=True)
    

    

__Replication of AlignedBases value__

In [3]:
def _read_mummer_table(path, n_columns, **read_csv_kwargs):
    """Read a headerless, tab-separated table into `col_0..col_{n-1}`.

    Returns an empty DataFrame with the same column names when the file
    holds no data, instead of letting pandas raise `EmptyDataError`.
    """
    names = [f"col_{idx}" for idx in range(n_columns)]
    try:
        return pd.read_csv(Path(path), sep="\t", names=names, **read_csv_kwargs)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=names)


def AlignedBases(genome_seq, mcoords_file, rdiff_file):
    """Calculate and return the AlignedBases value
    using the same approach as implemented by dnadiff.

    The value starts as the summed length of every reference sequence
    that appears in the M-to-M alignments, and is then reduced by every
    positive length reported in the `.rdiff` file for features other
    than `DUP`.

    :param genome_seq: Path to the FASTA file of the genome used in the analysis
    :param mcoords_file: Path to the `.mcoords` file written by `show-coords -rclTH`
    :param rdiff_file: Path to the `.rdiff` file written by `show-diff -rH`
    :return: tuple `(rnABases, GenomeCov)`: the aligned base count and
        that count divided by the total number of bases in `genome_seq`
    :raises KeyError: if the alignments name a sequence absent from `genome_seq`
    """

    # Step 1. Getting basic sequence information
    refs = {
        record.id: len(record.seq)
        for record in SeqIO.parse(Path(genome_seq), "fasta")
    }
    TotalBases = sum(refs.values())

    # Step 2. Retrieving information from M-to-M alignments (mcoords)
    # The ID columns are read as text so that purely numeric sequence IDs
    # still match the string keys of `refs`.
    mcoords = _read_mummer_table(
        mcoords_file, 13, dtype={"col_11": str, "col_12": str}
    )

    # Each aligned reference sequence contributes its full length once.
    rnABases = sum(refs[ref_id] for ref_id in mcoords["col_11"].unique())

    # Step 3. Retrieving information from the `.rdiff` file and updating
    # the AlignedBases value. An empty file means nothing is subtracted.
    rdiff = _read_mummer_table(rdiff_file, 7)

    gap_lengths = pd.to_numeric(rdiff["col_4"])
    # Only positive lengths of non-DUP features count as unaligned bases.
    unaligned = (rdiff["col_1"] != "DUP") & (gap_lengths > 0)
    rnABases -= int(gap_lengths[unaligned].sum())

    # NOTE: the other dnadiff counters (alignment totals, insertion and
    # tandem-insertion sums) were previously accumulated here but never
    # returned, so they are no longer computed.

    # Step 4. Genome coverage calculation
    GenomeCov = rnABases / TotalBases

    return rnABases, GenomeCov

__Replication of AverageIdentity value__

In [4]:
def AverageIdentity(coords_file):
    """Return the length-weighted average identity of a coords table.

    For every row, the value in column 6 (a percentage) is weighted by
    the sum of columns 4 and 5; the weighted total is divided by the sum
    of those weights and expressed as a percentage.

    :param coords_file: Path to a headerless, tab-separated coords file
    :return: average identity as a percentage, rounded to 2 decimal places
    """
    table = pd.read_csv(Path(coords_file), sep="\t", header=None)
    table.columns = [f"col_{position}" for position in range(len(table.columns))]

    weighted_identity = 0
    combined_length = 0

    # Accumulate row by row, in file order, so the floating-point sum is
    # built in exactly the same sequence as a plain running total.
    per_row = zip(
        table["col_4"].tolist(),
        table["col_5"].tolist(),
        table["col_6"].tolist(),
    )
    for first_len, second_len, pct_identity in per_row:
        weighted_identity += (pct_identity / 100) * (first_len + second_len)
        combined_length += first_len + second_len

    return round(weighted_identity / combined_length * 100, 2)

Running the analysis on `donovan_AF_bug`

In [5]:
# Regenerate the MUMmer outputs for this genome pair, then print
# (AlignedBases, genome coverage) followed by AverageIdentity.
run_command_lines("../data/donovan_AF_bug/input")
print(
    AlignedBases(
        "../data/donovan_AF_bug/input/MGV-GENOME-0357962.fna",
        "../data/donovan_AF_bug/output/output.mcoords",
        "../data/donovan_AF_bug/output/output.rdiff",
    )
)
print(AverageIdentity("../data/donovan_AF_bug/output/output.mcoords"))

(87285, 1.0)
99.94


1: PREPARING DATA
2,3: RUNNING mummer AND CREATING CLUSTERS
# reading input file "../data/donovan_AF_bug/output/output.ntref" of length 87286
# construct suffix tree for sequence of length 87286
# (maximum reference length is 2305843009213693948)
# (maximum query length is 18446744073709551615)
# CONSTRUCTIONTIME /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/donovan_AF_bug/output/output.ntref 0.01
# reading input file "/Users/angelikakiepas/Desktop/pyani_all/pyani/replicate_dnadiff/scripts/../data/donovan_AF_bug/input/MGV-GENOME-0358017.fna" of length 87353
# matching query-file "/Users/angelikakiepas/Desktop/pyani_all/pyani/replicate_dnadiff/scripts/../data/donovan_AF_bug/input/MGV-GENOME-0358017.fna"
# against subject-file "../data/donovan_AF_bug/output/output.ntref"
# COMPLETETIME /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/donovan_AF_bug/output/output.ntref 0.04
# SPACE /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/do

Comparing these to the dnadiff output

In [6]:
!cat 10 ../data/donovan_AF_bug/output/dnadiff/output.report | head -n 25

cat: 10: No such file or directory
/Users/angelikakiepas/Desktop/pyani/issue_421/rounding_error/scripts/../data/donovan_AF_bug/input/MGV-GENOME-0357962.fna /Users/angelikakiepas/Desktop/pyani/issue_421/rounding_error/scripts/../data/donovan_AF_bug/input/MGV-GENOME-0358017.fna
NUCMER

                               [REF]                [QRY]
[Sequences]
TotalSeqs                          1                    1
AlignedSeqs               1(100.00%)           1(100.00%)
UnalignedSeqs               0(0.00%)             0(0.00%)

[Bases]
TotalBases                     87285                87353
AlignedBases          87285(100.00%)       87353(100.00%)
UnalignedBases              0(0.00%)             0(0.00%)

[Alignments]
1-to-1                             3                    3
TotalLength                    87496                87510
AvgLength                   29165.33             29170.00
AvgIdentity                    99.94                99.94

M-to-M                             3     

Running the analysis on `donovan`'s second example.

In [7]:
# Regenerate the MUMmer outputs for this genome pair, then print
# (AlignedBases, genome coverage) followed by AverageIdentity.
run_command_lines("../data/donovan_test/input")
print(
    AlignedBases(
        "../data/donovan_test/input/MGV-GENOME-0264574.fna",
        "../data/donovan_test/output/output.mcoords",
        "../data/donovan_test/output/output.rdiff",
    )
)
print(AverageIdentity("../data/donovan_test/output/output.mcoords"))

1: PREPARING DATA
2,3: RUNNING mummer AND CREATING CLUSTERS
# reading input file "../data/donovan_test/output/output.ntref" of length 39254
# construct suffix tree for sequence of length 39254
# (maximum reference length is 2305843009213693948)
# (maximum query length is 18446744073709551615)
# CONSTRUCTIONTIME /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/donovan_test/output/output.ntref 0.00
# reading input file "/Users/angelikakiepas/Desktop/pyani_all/pyani/replicate_dnadiff/scripts/../data/donovan_test/input/MGV-GENOME-0266457.fna" of length 39594
# matching query-file "/Users/angelikakiepas/Desktop/pyani_all/pyani/replicate_dnadiff/scripts/../data/donovan_test/input/MGV-GENOME-0266457.fna"
# against subject-file "../data/donovan_test/output/output.ntref"
# COMPLETETIME /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/donovan_test/output/output.ntref 0.01
# SPACE /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/donovan_test/o

(39169, 0.997860036175579)
99.63


In [8]:
!cat 10 ../data/donovan_test/output/dnadiff/output.report | head -n 25

cat: 10: No such file or directory
/Users/angelikakiepas/Desktop/pyani/issue_421/rounding_error/scripts/../data/donovan_test/input/MGV-GENOME-0264574.fna /Users/angelikakiepas/Desktop/pyani/issue_421/rounding_error/scripts/../data/donovan_test/input/MGV-GENOME-0266457.fna
NUCMER

                               [REF]                [QRY]
[Sequences]
TotalSeqs                          1                    1
AlignedSeqs               1(100.00%)           1(100.00%)
UnalignedSeqs               0(0.00%)             0(0.00%)

[Bases]
TotalBases                     39253                39594
AlignedBases           39169(99.79%)        39176(98.94%)
UnalignedBases             84(0.21%)           418(1.06%)

[Alignments]
1-to-1                             2                    2
TotalLength                    59174                59187
AvgLength                   29587.00             29593.50
AvgIdentity                    99.63                99.63

M-to-M                             2         

Running the analysis on two _Streptomyces_ genomes

In [9]:
# Regenerate the MUMmer outputs for this genome pair, then print
# (AlignedBases, genome coverage) followed by AverageIdentity.
run_command_lines("../data/streptomyces_genomes/input")
print(
    AlignedBases(
        "../data/streptomyces_genomes/input/GCF_002802945.1_ASM280294v1_genomic.fna",
        "../data/streptomyces_genomes/output/output.mcoords",
        "../data/streptomyces_genomes/output/output.rdiff",
    )
)
print(AverageIdentity("../data/streptomyces_genomes/output/output.mcoords"))

1: PREPARING DATA
2,3: RUNNING mummer AND CREATING CLUSTERS
# reading input file "../data/streptomyces_genomes/output/output.ntref" of length 6937472
# construct suffix tree for sequence of length 6937472
# (maximum reference length is 2305843009213693948)
# (maximum query length is 18446744073709551615)
# process 69374 characters per dot
#....................................................................................................
# CONSTRUCTIONTIME /opt/anaconda3/envs/pyani_issue_421/opt/mummer-3.23/mummer ../data/streptomyces_genomes/output/output.ntref 3.40
# reading input file "/Users/angelikakiepas/Desktop/pyani_all/pyani/replicate_dnadiff/scripts/../data/streptomyces_genomes/input/GCF_003369795.1_ASM336979v1_genomic.fna" of length 7787608
# matching query-file "/Users/angelikakiepas/Desktop/pyani_all/pyani/replicate_dnadiff/scripts/../data/streptomyces_genomes/input/GCF_003369795.1_ASM336979v1_genomic.fna"
# against subject-file "../data/streptomyces_genomes/output/output

(1684244, 0.24277632455011083)
85.48


In [10]:
!cat 10 ../data/streptomyces_genomes/output/dnadiff/output.report | head -n 25

cat: 10: No such file or directory
/Users/angelikakiepas/Desktop/pyani/issue_421/rounding_error/scripts/../data/streptomyces_genomes/input/GCF_002802945.1_ASM280294v1_genomic.fna /Users/angelikakiepas/Desktop/pyani/issue_421/rounding_error/scripts/../data/streptomyces_genomes/input/GCF_003369795.1_ASM336979v1_genomic.fna
NUCMER

                               [REF]                [QRY]
[Sequences]
TotalSeqs                         41                    1
AlignedSeqs               22(53.66%)           1(100.00%)
UnalignedSeqs             19(46.34%)             0(0.00%)

[Bases]
TotalBases                   6937431              7787608
AlignedBases         1684244(24.28%)      1737298(22.31%)
UnalignedBases       5253187(75.72%)      6050310(77.69%)

[Alignments]
1-to-1                          1455                 1455
TotalLength                  1679781              1679569
AvgLength                    1154.49              1154.34
AvgIdentity                    85.35                85