In [None]:
from glob import glob
import os
from collections import defaultdict

def get_organism(gff):
    """Return the organism name from the GenBank flat file next to ``gff``.

    The directory that holds ``gff`` is searched for ``*.gbff`` files. The
    first one in sorted order is scanned for lines starting with ``SOURCE``
    and the text following that keyword is returned with surrounding
    whitespace removed. When the file has several ``SOURCE`` lines the last
    one wins.

    Args:
        gff: Path to a GFF annotation file.

    Returns:
        The organism string, or ``None`` when the directory has no
        ``*.gbff`` file or that file has no ``SOURCE`` line.
    """
    genome_dir = os.path.dirname(gff)
    # os.path.join keeps the pattern relative when ``gff`` has no directory
    # part; string formatting would produce "/*.gbff" and search the root.
    reports = sorted(glob(os.path.join(genome_dir, '*.gbff')))
    if not reports:
        return None

    keyword = 'SOURCE'
    organism = None
    with open(reports[0]) as fh:
        for line in fh:
            if line.startswith(keyword):
                organism = line[len(keyword):].strip()
    return organism

def get_pid(gff, protein_name):
    """Return the ``Name`` attribute of the first feature mentioning ``protein_name``.

    The file is scanned line by line. A line qualifies when it is not a
    ``#`` comment, contains ``protein_name`` (case-insensitive, anywhere in
    the line), has at least nine tab-separated columns and carries a
    non-empty ``Name=`` entry in the ninth column.

    Args:
        gff: Path to a GFF annotation file.
        protein_name: Text to look for, e.g. a product description.

    Returns:
        The value of the ``Name`` attribute of the first qualifying line, or
        ``None`` when no line qualifies.
    """
    needle = protein_name.lower()
    with open(gff) as fh:
        for line in fh:
            if line.startswith('#') or needle not in line.lower():
                continue
            columns = line.rstrip('\r\n').split('\t')
            if len(columns) < 9:
                # Not a complete feature line; there is no attribute column.
                continue
            for attribute in columns[8].split(';'):
                key, sep, value = attribute.partition('=')
                if sep and key.strip() == 'Name' and value:
                    return value
    return None
                
def fasta_read(path_to_fasta_file):
    """Read a FASTA file into a ``{record_id: sequence}`` dictionary.

    The record id is the first whitespace-delimited token after ``>``; a
    bare ``>`` yields the empty string as id. Sequence lines are stripped
    and concatenated. Lines that appear before the first header are
    ignored. When an id occurs more than once, the last record wins.

    Args:
        path_to_fasta_file: Path to the FASTA file.

    Returns:
        Dictionary mapping record id to its full sequence string.
    """
    fasta = {}
    header = None
    chunks = []
    with open(path_to_fasta_file) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                if header is not None:
                    fasta[header] = "".join(chunks)
                fields = line[1:].split()
                header = fields[0] if fields else ""
                chunks = []
            elif header is not None:
                chunks.append(line)
        # Flush the record that was still open at end of file.
        if header is not None:
            fasta[header] = "".join(chunks)
    return fasta

In [None]:

# Protein product names that are looked up in every annotation file.
sco_to_analize = [
    '30S ribosomal protein S7',
    'molecular chaperone DnaK',
    'F0F1 ATP synthase subunit alpha',
    '50S ribosomal protein L6',
    'aminoacyl-tRNA hydrolase',
    'nucleotide exchange factor GrpE',
    'uracil phosphoribosyltransferase',
    'translation initiation factor IF-1',
    '30S ribosome-binding factor RbfA',
    'preprotein translocase subunit SecG',
    'amidotransferase subunit GatB',
    'cell wall cluster transcriptional repressor MraZ',
    'deoxyribose-phosphate aldolase',
    'methyltransferase RsmH',
    '50S ribosomal protein L32',
    'chromosomal replication initiator protein DnaA',
]

# Collects one row per target protein; filled by the multifasta loop.
summary_table = list()

# Every GFF annotation found two directory levels below the data root.
anno_files = glob("/home/dzilov/data_zilov/data/mycoplasma/*/*/*.gff")
print(len(anno_files), 'genomes to analyze')

# Parse every CDS FASTA once; each entry is the dictionary fasta_read returns.
all_fastas = [
    fasta_read(cds_path)
    for cds_path in glob("/home/dzilov/data_zilov/data/mycoplasma/*/*/*cds_from_genomic.fna")
]
    
# build multifasta for each sco by all bacteria

# The organism of a genome does not depend on the target protein, so every
# GenBank report is parsed once here instead of once per target.
organism_by_gff = {gff_path: get_organism(gff_path) for gff_path in anno_files}

for sco in sco_to_analize:
    print(f"take {sco} to build multifasta")

    sco_joined = "_".join(sco.split())
    multifasta_file = f"/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/{sco_joined}.fasta"
    # The file is rewritten on every run; make sure its directory exists first.
    os.makedirs(os.path.dirname(multifasta_file), exist_ok=True)

    # protein id -> organism. A protein id found in several genomes keeps the
    # organism of the last genome it was seen in.
    pids2bact = {}
    n_missed = 0
    for gff_path in anno_files:
        pid = get_pid(gff_path, sco)
        organism = organism_by_gff[gff_path]
        if pid and organism:
            pids2bact[pid] = organism
        else:
            n_missed += 1
            print(f"MISSED {sco} in {gff_path}, organism: {organism}")

    print(f'{len(pids2bact)} uniq prots in {sco}')

    multifasta = {}
    for pid, organism in pids2bact.items():
        # Record label: protein id plus the organism text before the first
        # comma, with whitespace replaced by underscores.
        short_organism = "_".join(organism.split(",")[0].split())
        record_id = f"{pid}_{short_organism}"
        for prots in all_fastas:
            for header, seq in prots.items():
                # Substring match: the CDS header only has to contain the id.
                if pid in header:
                    multifasta[record_id] = seq

    with open(multifasta_file, "w") as fw:
        for h, s in multifasta.items():
            fw.write(f">{h}\n{s}\n")

    print(len(multifasta), f'prots added to multifasta {multifasta_file}')

    # Row layout: target, number of CDS FASTA files read, annotation files
    # without a hit, number of distinct protein ids, output path.
    summary_table.append(
        [sco, str(len(all_fastas)), str(n_missed), str(len(pids2bact)), multifasta_file]
    )

MISSED 30S ribosome-binding factor RbfA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000008405.1_ASM840v1/GCF_000008405.1_ASM840v1_genomic.gff, organism: Mycoplasma hyopneumoniae 232
MISSED 30S ribosome-binding factor RbfA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000281235.1_ASM28123v1/GCF_000281235.1_ASM28123v1_genomic.gff, organism: Candidatus Mycoplasma haemolamae str. Purdue
MISSED 30S ribosome-binding factor RbfA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000477415.1_ASM47741v1/GCF_000477415.1_ASM47741v1_genomic.gff, organism: Mycoplasma parvum str. Indiana
MISSED 30S ribosome-binding factor RbfA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_021383865.1_ASM2138386v1/GCF_021383865.1_ASM2138386v1_genomic.gff, organism: Mycoplasma hyopneumoniae
MISSED 30S ribosome-binding factor RbfA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000400855.1_ASM40085v1/GCF_000400855.1

MISSED cell wall cluster transcriptional repressor MraZ in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000018785.1_ASM1878v1/GCF_000018785.1_ASM1878v1_genomic.gff, organism: Acholeplasma laidlawii PG-8A
MISSED cell wall cluster transcriptional repressor MraZ in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000183365.1_ASM18336v1/GCF_000183365.1_ASM18336v1_genomic.gff, organism: Mycoplasma leachii PG50
MISSED cell wall cluster transcriptional repressor MraZ in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_900476025.1_50465_E02/GCF_900476025.1_50465_E02_genomic.gff, organism: Acholeplasma laidlawii
MISSED cell wall cluster transcriptional repressor MraZ in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_000203215.1_ASM20321v1/GCF_000203215.1_ASM20321v1_genomic.gff, organism: Mycoplasma suis KI3806
MISSED cell wall cluster transcriptional repressor MraZ in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genom

MISSED chromosomal replication initiator protein DnaA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_007858515.1_ASM785851v1/GCF_007858515.1_ASM785851v1_genomic.gff, organism: Mycoplasma anserisalpingitidis
MISSED chromosomal replication initiator protein DnaA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_019720735.1_ASM1972073v1/GCF_019720735.1_ASM1972073v1_genomic.gff, organism: Mycoplasma sp. Ms02
MISSED chromosomal replication initiator protein DnaA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_008728895.1_ASM872889v1/GCF_008728895.1_ASM872889v1_genomic.gff, organism: Mycoplasma gallisepticum
MISSED chromosomal replication initiator protein DnaA in /home/dzilov/data_zilov/data/mycoplasma/all_complete_genomes/GCF_008728935.1_ASM872893v1/GCF_008728935.1_ASM872893v1_genomic.gff, organism: Mycoplasma gallisepticum
MISSED chromosomal replication initiator protein DnaA in /home/dzilov/data_zilov/data/mycoplasma/all_complete

95 prots added to multifasta /home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/chromosomal_replication_initiator_protein_DnaA.fasta


In [None]:
## MSA with MUSCLE for each protein cluster

# Write one MUSCLE command per multifasta into a command file, then run the
# lines of that file in parallel through xargs.

muscle_xargs = '/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/xargs_muscle.txt'
muscle_dir = '/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/muscle'

multifasta_list = glob('/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/*.fasta')

# Create the output directory up front; every command below writes into it.
os.makedirs(muscle_dir, exist_ok=True)

with open(muscle_xargs, 'w') as fw:
    for fasta_path in multifasta_list:
        out = os.path.join(muscle_dir, f"alignment_{os.path.basename(fasta_path)}")
        fw.write(f'/home/dzilov/soft/miniconda3/envs/muscle/bin/muscle -in {fasta_path} -out {out}\n')

# Each line of the command file is run as its own shell command, up to 15 at
# a time. The file is fed to xargs by redirection and its path comes from
# muscle_xargs, so the writer above and the runner cannot drift apart.
command = "xargs -I {} -P 15 sh -c '{}' < " + muscle_xargs
exit_status = os.system(command)
if exit_status != 0:
    # Keep going, but make a failed batch visible in the cell output.
    print(f'MUSCLE batch finished with non-zero status {exit_status}')

In [None]:
## phylogeny with IQTree for each protein cluster alignment

# Write one IQ-TREE command per alignment into a command file, then run the
# lines of that file in parallel through xargs.

iqtree_xargs = '/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/xargs_iqtree.txt'
iqtree_dir = '/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/iqtree_with_acholeplasma/'

alignment_list = glob('/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/muscle/*.fasta')

# Create the output directory up front; every --prefix below points into it.
os.makedirs(iqtree_dir, exist_ok=True)

with open(iqtree_xargs, 'w') as fw:
    for alignment in alignment_list:
        # Output prefix: alignment file name without extension, inside iqtree_dir.
        out = os.path.join(iqtree_dir, os.path.splitext(os.path.basename(alignment))[0])
        # NOTE(review): --mem 100G is given to each of up to 15 concurrent
        # runs; confirm the host can afford that.
        fw.write(f'/home/dzilov/soft/miniconda3/envs/iqtree/bin/iqtree -T 1 --mem 100G -s {alignment} --prefix {out}\n')

# Each line of the command file is run as its own shell command, up to 15 at
# a time. The file is fed to xargs by redirection and its path comes from
# iqtree_xargs, so the writer above and the runner cannot drift apart.
command = "xargs -I {} -P 15 sh -c '{}' < " + iqtree_xargs
exit_status = os.system(command)
if exit_status != 0:
    # Keep going, but make a failed batch visible in the cell output.
    print(f'IQ-TREE batch finished with non-zero status {exit_status}')

In [None]:
## tree to png

## conda activate newick_utils

# Only the command file is written here; it is not executed by this cell.

tree_draw_xargs = '/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/xargs_drawtree.txt'
pic_dir = '/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/iqtree_new/tree_pics/'

# NOTE(review): trees are read from iqtree_w/ and pictures go to iqtree_new/,
# while the IQ-TREE cell of this notebook writes to iqtree_with_acholeplasma/.
# Confirm these directories are the intended ones.
trees = glob('/home/dzilov/data_zilov/data/mycoplasma/clusters/multifasta/iqtree_w/*.treefile')

# Create the picture directory up front; every command below writes into it.
os.makedirs(pic_dir, exist_ok=True)

with open(tree_draw_xargs, 'w') as fw:
    for tree in trees:
        print(tree)
        prot_name = os.path.basename(tree).split('.treefile')[0]
        svg = f'{pic_dir}{prot_name}.svg'
        png = f'{pic_dir}{prot_name}.png'
        # One line per tree with the steps chained by &&: draw, convert, then
        # remove the SVG. On separate lines the steps could run out of order
        # when the file is executed in parallel, and the SVG would be deleted
        # even if the conversion failed.
        fw.write(f'nw_display -w 2000 -s {tree} > {svg} && convert {svg} {png} && rm {svg}\n')

In [None]:
# Dump the per-target summary rows as a tab-separated file with a header line.
summary_file = '/media/eternus1/projects/zilov/data/mycoplasma/clusters/multifasta/mycoplasma_targets_to_tree_summary.tsv'

header_line = '#target\tgenomes_analyzed\tmissed_genomes\tunique_sequnces\tmultifasta_file\n'

with open(summary_file, "w") as fw:
    fw.write(header_line)
    # Every row is a list of strings; join the fields with tabs, one row per line.
    fw.writelines("\t".join(row) + "\n" for row in summary_table)

In [None]:
# Show the collected summary rows as the cell output.
summary_table

In [None]:
# Per-column symbol counts of an alignment: position -> {symbol: count}.
# NOTE(review): fasta_msa is not defined in the visible cells; presumably it
# is the {header: aligned sequence} dictionary of one alignment. Confirm.

# Count slots are created on first use of a position, so sequences longer
# than the first one do not raise.
position_counts = defaultdict(lambda: {'A': 0, 'C': 0, 'G': 0, 'T': 0, '-': 0, 'N': 0})

for aligned_seq in fasta_msa.values():
    for i, n in enumerate(aligned_seq):
        n = n.upper()
        # Symbols outside the six tracked ones are counted as N instead of
        # raising KeyError.
        if n not in position_counts[i]:
            n = 'N'
        position_counts[i][n] += 1

# Freeze into a plain mapping so later lookups cannot create positions.
consensus_table = dict(position_counts)

In [None]:
# Build the consensus of the alignment, one symbol per column of consensus_table.

# A symbol takes part in the consensus of a column when it occurs in more
# than this fraction of the aligned sequences.
MIN_SYMBOL_FRACTION = .3

# IUPAC nucleotide code for every combination of unambiguous bases.
IUPAC_BY_BASES = {
    frozenset('A'): 'A',
    frozenset('C'): 'C',
    frozenset('G'): 'G',
    frozenset('T'): 'T',
    frozenset('AG'): 'R',
    frozenset('CT'): 'Y',
    frozenset('GC'): 'S',
    frozenset('AT'): 'W',
    frozenset('GT'): 'K',
    frozenset('AC'): 'M',
    frozenset('CGT'): 'B',
    frozenset('AGT'): 'D',
    frozenset('ACT'): 'H',
    frozenset('ACG'): 'V',
    frozenset('ACGT'): 'N',
}


def _consensus_symbol(counts, n_seqs):
    """Return the consensus symbol for one alignment column.

    Args:
        counts: Mapping of symbol to the number of sequences showing it.
        n_seqs: Total number of aligned sequences.

    Returns:
        The IUPAC code for the bases seen in more than MIN_SYMBOL_FRACTION
        of the sequences. ``'N'`` is returned when a gap or ``N`` is among
        those symbols, and also when no symbol passes the threshold, so
        every column yields exactly one character.
    """
    frequent = frozenset(
        symbol for symbol, count in counts.items()
        if count / n_seqs > MIN_SYMBOL_FRACTION
    )
    # Any set that is not a pure base combination falls back to N.
    return IUPAC_BY_BASES.get(frequent, 'N')


n_seqs = len(fasta_msa)
consensus = [_consensus_symbol(counts, n_seqs) for counts in consensus_table.values()]

# Last expression of the cell: display the consensus string.
consensus_seq = ''.join(consensus)
consensus_seq