In [68]:
import sys
sys.path.append("/home/dzilov/Dropbox/gepymics/")

import os
import re
from glob import glob
from gepymics import fasta_read
from gepymics import reverse_complement
from collections import defaultdict

In [69]:
# first article == DOI: 10.1159/000357096
# second article == https://doi.org/10.1016/j.biologicals.2019.12.007
# third article == https://doi.org/10.1016/j.biologicals.2016.03.003

first_article_primers = {
    'F1': 'GCAAAGCTATAGAGATATAGTAGAGGT', 'F2': 'GCAAAGCTATAGAGATATAGTGGAGGT', 'F3': 'GCAAAGCTATAGAAATATAGTGGAGGT',
    'F4': 'GCGAAGCTATAGAAATATAGTGGAGGT', 'F5': 'GCGAAGCTATAGAGATATAGTGGAGGT', 'F6': 'GCAATGCTATAGAGATATAGCGGAGGT',
    'F7': 'GCAAAGCTATGGAGACATAGTGGAGGT', 'F8': 'GCAAAGTTATGGAAACATAATGGAGGT', 'R1': 'GTTGCGCTCGTTGCAGGAC',
    'R2': 'GTTGCGCTCGTTGCGGGAC', 'R3': 'GTTGCGTTCGTTGCAGGAC', 'R4': 'GTTGCGTTCGTTGCGGGAC'
} 


second_article_primers = {
    'F1': 'AAACTCAAAGGAATTGACGG', 'F2': 'AAACTCAAACGGAATTGACGG', 'F3': 'AAACTTAAAGGAATTGACGGG',
    'R1': 'CATCTGTTAACCTCCGAACTT', 'R2': 'CATCTGTTAACCTCCGTACTT', 'R3': 'CATACCGATAACCTCCACTAT',
    'R4': 'GTCACCTTGTTAACCTCCATTAT', 'R5': 'TGACTCTGATATCCTCCACTAT', 'R6': 'GTCATTCCGTTAACCTCAACTAT',
    'R7': 'GTCATTCCGTTAACCTCGACTAT', 'R8': 'CACCCTGATAACCTCCACTAT', 'R9': 'CACCCTGTTAACCTCCACTAT',
    'R10': 'CATTCCGTTGACCTCCACTAT', 'R11': 'TGTATCCTTGTTAACCTCTACTATATC', 'R12': 'CACTCTGTTAACCTCCACTAT',
    'R13': 'CTCAATGTTAACCTCCACTACA', 'R14': 'CACTCCGTTAGCCTCCACTAT', 'R15': 'CACTCCGTTAGCCTCCTCTAT',
    'R16': 'CTCAATGTTAGCCTCCACTAC', 'R17': 'GTATCTCTGTTAACCTCCACTATATC', 'R18': 'CTTACTGATAGCCTCCACTAT',
    'R19': 'CTTACTGATTGCCTCCACTAT', 'R20': 'GTACATCTGTTAGCCTCCTCTAT', 'R21': 'ATCTCGTTAGCCTCCACTAT',
    'R22': 'ATTCCGTTGACCTCCACTAT', 'R23': 'GTCATATTGTTAACCTCCGTTATATTTC', 'R24': 'TCTCAATGTTAACCTCCACTAT',
    'R25': 'TCTCAATGTTAGCCTCCACTAT', 'R26': 'CATCTGTTAGCCTCCGAACTT', 'R27': 'CAATGTTAGCCTCCAGCAT',
    'R28': 'ACTCGGTTAACCTCCATTATGT', 'R29': 'TGTCATTCTGTTAACCTCTACTATATC', 'R30': 'CACTCGGTTAACCTCCACTAT',
    'R31': 'GTCATCTTGTTAACCTCTACTATATCT', 'R32': 'CACTCCGTTAACCTCCACTAT', 'R33': 'GTCATCTTGTTAACCTCCACTATATC',
    'R34': 'ATTGGGTTGACCTCCACTAT', 'R35': 'TCATTCTGTTAGCCTCCACTAT', 'R36': 'TCATTCTGTTAGCCTCTACTAT',
    'R37': 'CTCAATGTTAACCTCCACTGT', 'R38': 'TGCTTCTGATAACCTCCACGAT', 'R39': 'TGCTTCTGATAACCTCCACTAT',
    'R40': 'TCATTCTGTTAACCTCCACTATATC', 'R41': 'GTCATTCTGATATACTCCACTGT', 'R42': 'TCATTCTGTTAACCTCCACTATGTC',
    'R43': 'TACATCTGTTAACCTCCACTATATC', 'R44': 'ACTCCGTTAACCTCCGCTAT', 'R45': 'TCTTTCGGTTAACCTCGACTAT',
    'R46': 'CACTCCGATAACCTCCACTAT', 'R47': 'GTTATTCTGTTAACCTCCAGTATG', 'R48': 'CACCCTGTTAACCTCTACTATATC',
    'R49': 'ACCTCGTTAACCTCCGCCAT', 'R50': 'ACCTCGTTAACCTCCGCTAT', 'R51': 'GTCATATTGTTAACCTCAACTATATTTCTATAGC',
    'R52': 'ACTCTGTTAACCTCCGCTAT', 'R53': 'ACTCTGTTAACCTCTGCTAT', 'R54': 'TCTCAATGTTAACCTCGGATATATC',
    'R55': 'TCCGTTAACCTCCGCT', 'R56': 'TCCGTTAGCCTCCGCT', 'R57': 'GTCTTACTGATATACTCCACCAT',
    'R58': 'CACCTCGATAACCTCCACTAT', 'R59': 'CACCTTGATAACCTCCACTAT', 'R60': 'ATCTCGTTAGCCTCGGCT',
    'R61': 'ATCCCTGATAACCTCCACTAT', 'R62': 'GTCTCAATGTTAACCTCTACTGT', 'R63': 'CCTTGATAGCCTCCGTATATG',
    'R64': 'GTCACCTTGTTAGCCTCCATTAT', 'R65': 'ACATCGGATAACCTCCACTAT'}


third_article_primers = {
    "F1": "GGATTAGATACCCTAGTAGTCCACA", "R1" : "CGTGTACCGTCGAATTAAGCA",
    "F2": "ACTAAGTGTTGGCCAAAAGGTC", "R2": "CCTCCGAATTTATTTCTAAGCCTTTG",
    "F3": "TCATCATGCCTCTTACGAGTG", "R3": "GCGGTGTGTACAAGACCCGA",
    "F4": "CTCCGCCTGAGTAGTATGC", "R4": "CACCTGTCTCAATGTTAACCTC"}

In [70]:
# merge all articles in one dictionary

all_primers = {}

for i, primer in first_article_primers.items():
    all_primers[f"A1{i}"] = primer

for i, primer in second_article_primers.items():
    all_primers[f"A2{i}"] = primer

for i, primer in third_article_primers.items():
    all_primers[f"A3{i}"] = primer

In [89]:
# write primers to the table

primers_table = "/home/dzilov/Dropbox/mycolasma_primers/tables/articles_primers.tsv"

with open(primers_table, "w") as fw:
    fw.write("#id\tprimer_seq\n")
    fw.write("# A1 (article_1) == https://doi.org/10.1159/000357096\n")
    fw.write("# A2 (article_2) == https://doi.org/10.1016/j.biologicals.2019.12.007\n")
    fw.write("# A3 (article_3) == https://doi.org/10.1016/j.biologicals.2016.03.003\n")
    for i, primer in all_primers.items():
        fw.write(f"{i}\t{primer}\n")

In [94]:
workdir = "/media/eternus1/projects/zilov/data/mycoplasma/all_complete_genomes/*"
genomes = glob(f"{workdir}/*.fna")

In [96]:
# check if primers in genome

primers_in = defaultdict(list)

for genome in genomes:
    genome_name = os.path.basename(genome)
    fasta_dict = fasta_read(genome)
    for header, seq in fasta_dict.items():
        for i, primer in all_primers.items():
            if (primer or reverse_complement(primer)) in seq:
                primers_in[genome_name].append(i)

In [98]:
# check primer pairs found in genomes

primer_pairs_found = {"1" : [], "2" : [], "3" : []}

for genome, matches in primers_in.items():
    for key in primer_pairs_found.keys():
        pattern_f = re.compile(f"A{key}[\d]*")
        pattern_r = re.compile(f"A{key}R[\d]*")
        if any(pattern_f.match(m) for m in matches) and any(pattern_r.match(m) for m in matches):
            if genome not in primer_pairs_found[key]:
                primer_pairs_found[key].append(genome)

In [100]:
for k, v in primer_pairs_found.items():
    print(k, len(v))

1 107
2 57
3 91


In [106]:
for k, v in primer_pairs_found.items():
    print(f"Article {k}: {len(v)}")

Article 1: 107
Article 2: 57
Article 3: 91


In [102]:
# write primers matches to the table

assembly_ids = []
for genome in genomes:
    ident = "_".join(os.path.splitext(os.path.basename(genome))[0].split("_")[:2])
    assembly_ids.append(ident)

In [103]:
def ncbi_id(db, db_id_query):
    from Bio import Entrez
    Entrez.email = "ad3002@gmail.com"
    handle = Entrez.esearch(db=db, retmax=100, term=db_id_query)
    query_id = Entrez.read(handle)["IdList"]
    return query_id

ncbi_ids = []
for aid in assembly_ids:
    ncbi_ids.append(ncbi_id("assembly", aid)[0])

In [104]:
def ncbi_id_summary(db, db_id):
    from Bio import Entrez
    Entrez.email = "ad3002@gmail.com"
    handle = Entrez.esummary(db=db, id=db_id)
    summary = Entrez.read(handle)
    handle.close()
    return summary

assembly_table = [] # assembly_acc assembly_name taxid species assembly_status
for nid in ncbi_ids:
    summary = ncbi_id_summary("assembly", nid)
    info_dict = summary["DocumentSummarySet"]["DocumentSummary"][0]
    if not info_dict["SubmissionDate"]:
        print(info_dict)
    genome_info = [info_dict["AssemblyAccession"], info_dict["AssemblyName"], info_dict["Taxid"], info_dict["SpeciesName"], info_dict["AssemblyStatus"], info_dict["SubmissionDate"]]
    assembly_table.append(genome_info)

In [129]:
primers_found_table = {}
for g in assembly_table:
    genome_name = g[0]
    for k, v in primers_in.items():
        if genome_name in k:
            g[-1] = ";".join(v)
            if genome_name not in primers_found_table.keys():
                primers_found_table[genome_name] = g[1:]

In [130]:
## write all genomes in which at least one primer was found

primers_in_genome_table = "./tables/primers_in_all_genomes.tsv"

with open(primers_in_genome_table, "w") as fw:
    fw.write("#assembly_acc\tassembly_name\ttaxid\tspecies\tassembly_status\tassembly_date\tprimers_found\n")
    for k, v in primers_found_table.items():
        line_to_write = "\t".join(v)
        fw.write(f"{k}\t{line_to_write}\n")

In [134]:
for g, info in primers_found_table.items():
    fo`

GCF_000023685.1
GCF_012934885.1
GCF_004771095.1
GCF_000143865.1
GCF_900489665.1
GCF_003663725.1
GCF_900660555.1
GCF_900489545.1
GCF_900489685.1
GCF_000200735.1
GCF_014352955.1
GCF_013402755.1
GCF_000211295.1
GCF_000815065.1
GCF_900660465.1
GCF_900489675.1
GCF_019720735.1
GCF_008728895.1
GCF_900489805.1
GCF_008728935.1
GCF_013367995.1
GCF_000427215.1
GCF_000008205.1
GCF_900660735.1
GCF_900489555.1
GCF_002952835.1
GCF_004768725.1
GCF_020497635.1
GCF_003855455.1
GCF_900489515.1
GCF_000286695.1
GCF_000085865.1
GCF_900489705.1
GCF_017654545.1
GCF_007923985.1
GCF_013367935.1
GCF_900489725.1
GCF_900489825.1
GCF_000759375.2
GCF_009792315.1
GCF_014068355.1
GCF_016767155.1
GCF_900489765.1
GCF_900489715.1
GCF_007858495.1
GCF_900489795.1
GCF_013008635.1
GCF_000211545.4
GCF_001554055.1
GCF_900660565.1
GCF_013367855.1
GCF_000524555.1
GCF_001554075.1
GCF_020497585.1
GCF_900489525.1
GCF_900489595.1
GCF_002736285.1
GCF_004771115.1
GCF_000286775.1
GCF_000941075.1
GCF_000008225.1
GCF_900489655.1
GCF_9006

In [132]:
primers_found_table

{'GCF_000023685.1': ['ASM2368v1',
  '436113',
  'Mycoplasma mycoides',
  'Complete Genome',
  '2009/08/21 00:00',
  'A1F1;A2F1;A3F4;A1F1;A2F1;A3F4'],
 'GCF_000277795.1': ['ASM27779v1',
  '1197325',
  'Mycoplasma wenyonii',
  'Complete Genome',
  '2012/07/16 00:00',
  'A1F4'],
 'GCF_002090295.1': ['ASM209029v1',
  '2104',
  'Mycoplasma pneumoniae',
  'Complete Genome',
  '2017/04/13 00:00',
  'A1F8;A2F2;A3F1'],
 'GCF_002090215.1': ['ASM209021v1',
  '2104',
  'Mycoplasma pneumoniae',
  'Complete Genome',
  '2017/04/13 00:00',
  'A1F8;A2F2;A3F1'],
 'GCF_012934885.1': ['ASM1293488v1',
  '2726117',
  'Mycoplasma sp. C264-NAS',
  'Complete Genome',
  '2020/04/30 00:00',
  'A1R1;A2F3;A2R56;A3R3'],
 'GCF_002128265.1': ['ASM212826v1',
  '2104',
  'Mycoplasma pneumoniae',
  'Complete Genome',
  '2017/05/11 00:00',
  'A1F8;A2F2;A3F1'],
 'GCF_011464375.1': ['ASM1146437v1',
  '40480',
  'Mycoplasma capricolum',
  'Complete Genome',
  '2020/03/19 00:00',
  'A1F1;A2F1;A3F4'],
 'GCF_004771095.1': ['AS

In [135]:
primer_pairs_in_genome = {}

for g in assembly_table:
    genome_name = g[0]
    for k, v in primer_pairs_found.items():
        if genome_name in k:
            g[-1] = ";".join(v)
            if genome_name not in primers_found_table.keys():
                primer_pairs_in_genome[genome_name] = g[1:]

In [141]:
taxa_list = "/home/dzilov/data_zilov/data/mycoplasma/all_taxa.txt"

uniq_taxa = []

for i in assembly_table:
    if i[2] not in uniq_taxa:
        uniq_taxa.append(i[2])


with open(taxa_list, "w") as fw:
    for i in uniq_taxa:
        fw.write(f"{i}\n")
    