#This code will compare two files for Pfam domains

In [1]:
#!/usr/bin/python
from collections import defaultdict
import pandas as pd
from os.path import exists, join
import pdb
import os
import re
import csv

In [2]:
# Where the HMMER text results and the general input data live.
input_hmmer_dir = "../output/hmmer_txt_output/braker3_gff"
input_dir = "../input/"

# Where the tab-separated comparison table is written.
output_dir = "../output/hmmer_txt_output/braker3_gff/TLR_output_files"
output_file_pfam = "Pfam_domain_similarity_braker3_lrr_manual.tsv"
output_filepath = join(output_dir, output_file_pfam)

# Fail fast if any directory the notebook relies on is missing.
required = [input_dir, input_hmmer_dir, output_dir]

for r in required:
    if exists(r):
        print(f"Confirmed that required file or directory {r} exists.")
        continue
    raise FileNotFoundError(f"The specified hmmer file {r} does not seem to be in the specified path.")


Confirmed that required file or directory ../input/ exists.
Confirmed that required file or directory ../output/hmmer_txt_output/braker3_gff exists.
Confirmed that required file or directory ../output/hmmer_txt_output/braker3_gff/TLR_output_files exists.


In [3]:
# Start the output table from scratch with a single tab-separated header row.
# Opening in "w" mode discards any previous content of the file.
column_names = ["Species", "Pfam_domains_compared", "Isoform_ID"]
header_line = "\t".join(column_names) + "\n"
with open(output_filepath, "w") as output_pfam:
    output_pfam.write(header_line)

In [7]:
# Column labels applied to the 19 whitespace-delimited fields of each HMMER
# result row. The labels are kept exactly as this notebook has always spelled
# them so that any code indexing the columns by name keeps working.
_HMMER_COLUMNS = [
    "target_name", "accession description", "query name", "accession",
    "full sequence E-value", "full sequence score", "full sequence bias",
    "best 1 domain E-value", "best domain score", "best domain bias",
    "domain number estimation exp", "domain number estimation reg",
    "domain number estimation clu", "domain number estimation ov",
    "domain number estimation env", "domain number estimation dom",
    "domain number estimation rep", "domain number estimation inc",
    "description of target",
]


def _significant_gene_ids(hmmer_file, evalue_cutoff):
    """Return the set of gene IDs in one HMMER table that pass the E-value cutoff."""
    table = pd.read_csv(
        hmmer_file,
        sep=r"\s+",  # raw string: "\s" is not a valid escape in a normal string
        comment="#",
        index_col=False,
        names=_HMMER_COLUMNS,
    )
    significant = table[table["full sequence E-value"] < evalue_cutoff]
    # Compare genes rather than isoforms: keep only the text before the first
    # literal ".t". pandas treats multi-character split patterns as regular
    # expressions, so the dot must be escaped or it would match any character
    # (e.g. "contig7.g5.t1" would be cut at the "nt" inside "contig").
    gene_ids = significant["target_name"].str.split(r"\.t").str[0]
    return set(gene_ids)


def find_shared_domains(file_1, file_2, evalue_cutoff=1e-10):
    """Return the gene IDs that are significant hits in both HMMER text outputs.

    Both files are filtered the same way: only rows whose full-sequence
    E-value is strictly below ``evalue_cutoff`` are kept, and each target
    sequence name is reduced to its gene ID (the part before ``.t``).

    file_1 -- path to the text file of the first HMMER output.
    file_2 -- path to the text file of the second HMMER output to be compared.
    evalue_cutoff -- exclusive upper bound on the full-sequence E-value
        (default 1e-10, the threshold this notebook has always used).

    Returns a set of gene ID strings present in both filtered files.
    """
    # The same helper handles both files, so the second file is filtered too
    # (previously its unfiltered target names were used by mistake).
    df_1_target_names = _significant_gene_ids(file_1, evalue_cutoff)
    df_2_target_names = _significant_gene_ids(file_2, evalue_cutoff)
    return df_1_target_names.intersection(df_2_target_names)

In [8]:
# List the HMMER result files available for the chosen coral species.
coral_species = 'a_hyactintus'
coral_files = {}
# A file is of interest only when it starts with the species prefix and ends
# with the braker3 HMMER results suffix.
results_suffix = 'hmmer_braker3_gff_results.txt'
for file in os.listdir(input_hmmer_dir):
    if file.startswith(coral_species) and file.endswith(results_suffix):
        print(file)

a_hyactintus_soft_mask_braker_filtered_proteins_PF18017_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF01463_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF13927_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF18837_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF00047_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF18452_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF00069_seed_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF13855_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF12799_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF01582_full_up_profile_hmmer_braker3_gff_results.txt
a_hyact

In [31]:

# Pfam accessions to intersect, grouped by gene family.
# Accessions noted for other gene families:
#   Ig:    PF00047, PF18452, PF13927, PF13895
#   LRR:   PF00560, PF07723, PF07725, PF12799, PF13306, PF13516, PF13855,
#          PF14580, PF18805, PF18837
#   TIR:   PF01582
#   Death: PF00531
domains_by_gene_family = {"TLR":["PF01582","PF18837"]}
filename_template="a_hyactintus_soft_mask_braker_filtered_proteins_{domain}_full_profile_hmmer_braker3_gff_results.txt"
for gene_family, domains in domains_by_gene_family.items():
    print(domains)
    # None means "no comparison made yet". An empty set is a real result
    # (no gene carries every domain) and must not be mistaken for the
    # starting state, otherwise a later comparison would re-seed the
    # intersection and report false positives.
    results = None
    for domain in domains:
        # Work out the HMMER result file for the current domain.
        domain_file = filename_template.format(domain=domain)
        print(domain_file)
        domain_file_path = os.path.join(input_hmmer_dir, domain_file)
        print(domain_file_path)
        # Every ordered pair of domains is compared (including a domain with
        # itself), and all pairwise results are intersected into `results`.
        for domain_2 in domains:
            domain_file_2 = filename_template.format(domain=domain_2)
            print(domain_file_2)
            domain_file_path_2 = os.path.join(input_hmmer_dir, domain_file_2)
            print(domain_file_path_2)
            gene_ids = find_shared_domains(domain_file_path, domain_file_path_2)
            print(gene_ids)
            if results is None:
                results = set(gene_ids)
            else:
                results = results.intersection(gene_ids)
            print(len(results))
    if results is None:
        # No domains were listed for this gene family; keep `results` iterable.
        results = set()
            

['PF01582', 'PF18837']
a_hyactintus_soft_mask_braker_filtered_proteins_PF01582_full_profile_hmmer_braker3_gff_results.txt
../output/hmmer_txt_output/braker3_gff/a_hyactintus_soft_mask_braker_filtered_proteins_PF01582_full_profile_hmmer_braker3_gff_results.txt
a_hyactintus_soft_mask_braker_filtered_proteins_PF01582_full_profile_hmmer_braker3_gff_results.txt
../output/hmmer_txt_output/braker3_gff/a_hyactintus_soft_mask_braker_filtered_proteins_PF01582_full_profile_hmmer_braker3_gff_results.txt
{'anno1.g20329', 'anno1.g3943', 'anno1.g11141', 'anno1.g4848', 'anno1.g9987', 'anno1.g17273', 'anno1.g12545', 'anno1.g4739', 'anno1.g2564', 'anno1.g4844', 'anno1.g20454', 'anno1.g17262', 'anno1.g4835', 'anno1.g20738', 'anno1.g20735', 'anno1.g20737', 'anno1.g4845', 'anno1.g2563', 'anno1.g20736', 'anno1.g4843', 'anno1.g17268', 'anno1.g7451', 'anno1.g20330', 'anno1.g4840', 'anno1.g12542', 'anno1.g12541', 'anno1.g7077', 'anno1.g20210'}
28
a_hyactintus_soft_mask_braker_filtered_proteins_PF18837_full_pro

In [28]:
# Derive the labels that are written next to each gene ID in the output table.
# The species label is everything before "_soft" in the HMMER file name;
# str.index raises ValueError if the file name does not contain "_soft".
species = os.path.basename(domain_file[:domain_file.index('_soft')])
print(species)
# Join the compared Pfam accessions with a bare underscore. Slicing the list's
# string form left a space after each separator ("PF01582_ PF14580"), which
# makes the label split into extra columns in whitespace-delimited readers.
pfam_compare = "_".join(map(str, domains))
print(pfam_compare)


a_hyactintus
PF01582_ PF14580


In [29]:
# Append one tab-separated row (species, compared domains, gene ID) for every
# gene ID in `results`.
# NOTE: `results` is expected to be a set, so rows are written in no
# particular order.
with open(output_filepath, "a") as comparison_file:
    for gene_id in results:
        print(f"Writing output file: {species}, {pfam_compare}, {gene_id}")
        fields = (species, pfam_compare, gene_id)
        comparison_file.write("\t".join(str(field) for field in fields) + "\n")

Writing output file: a_hyactintus, PF01582_ PF14580, anno1.g11809
