## HMMER file parsing and results counting
This code will parse and count the number of HMMER search results generated from the previous step and output it to a table.

In [7]:
#!/usr/bin/python
from collections import defaultdict
import pandas as pd
from os.path import exists, join
import pdb
import os
import re

# Import files and setup directories for parsing files

In [14]:
input_hmmer_dir = "../output/hmmer_txt_output/"
input_dir = "../input/"

output_file = "hmmer_count_results_cnidarian2.tsv"
output_dir = "../output/"
output_filepath = join(output_dir,output_file)

required = [input_dir,input_hmmer_dir,output_dir]

for r in required:
    if not exists(r):
        raise FileNotFoundError(f"The specified hmmer file {r} does not seem to be in the specified path.")
    print(f"Confirmed that required file or directory {r} exists.")


Confirmed that required file or directory ../input/ exists.
Confirmed that required file or directory ../output/hmmer_txt_output/ exists.
Confirmed that required file or directory ../output/ exists.


# Prepare a output file so that data generated from the loop in the next step can be written to it.
The column names in the file are: Species File, Domain, Count, and Unique Count. The count is the total number of hits Hmmer found wile unique count only takes into account different hits (ie no duplicate hits).

In [15]:
#prepare the output file for the loop by adding column names
column_names = ["Species_File", "Domain", "Count", "Unique_Count_Isoform", "Unique_count_Sequence"]
with open(output_filepath, 'w') as output:
    output.write("%s" % "\t".join(column_names)+"\n")

## Loop to count the number of unique outputs generated by HMMER 
Convert HMMER text results into a pandas data frame and then count the results. 
Note: HMMER uses space delimination

In [16]:
for results_file in os.listdir(input_hmmer_dir):
    if not results_file.endswith(".txt"):
        continue
    #convert the HMMER output to a pandas dataframe (separated by spaces)
    hmmer_results = pd.read_csv((input_hmmer_dir + results_file),sep = "\s+", \
                                comment = "#", index_col = False,\
            names=["target_name", "accession description", "query name", "accession",\
            "full sequence E-value", "full sequence score", "full sequence bias",\
            "best 1 domain E-value", "best domain score", "best domain bias",\
            "domain number estimation exp", "domain number estimation reg",\
            "domain number estimation clu", "domain number estimation ov",\
            "domain number estimation env", "domain number estimation dom",\
            "domain number estimation rep", "domain number estimation inc",\
            "description of target"])
    
    #select e value columns. Count the number of total and unique ids in the column
    #if no values are present in the file (ie no matches were found) assign 0s.
    hit_indicies = hmmer_results["full sequence E-value"]<.1
    
    if not True in list(hit_indicies):
        print("No hits. Manualy assigning 0s")
        hmmer_unique_target_isoform = 0
        unique_target_id_count = 0
        hmmer_total_id = 0
    else:
        hmmer_select_df = hmmer_results[hit_indicies]
    
        #count the total number of hits by isolating the first value from the shape output
        hmmer_total_id = hmmer_select_df.shape[0]
    
        #count the uniques id hits including isoforms
        hmmer_unique_target_isoform = hmmer_select_df.target_name.nunique()
    
        #count unique target names only not number of isoforms
        unique_target_id = hmmer_select_df.target_name.str.split('.p').str[0]
        unique_target_id_count = unique_target_id.nunique()
    
    # extract domain used from file name
    results_domain_used = results_file.split("_")[-5]
    
    #extract only the species name from the output file name
    species_name = results_file[:results_file.index('longest')]
    
    #print what is being analyzed in the same stile that it will be put in the table
    print(f"Writing the output file: {species_name}, {results_domain_used},\
    {hmmer_total_id}, {hmmer_unique_target_isoform}, {unique_target_id_count}")
    
    #write the results to a table 
    with open(output_filepath, 'a') as output:
        output_field = [(species_name), (results_domain_used), (hmmer_total_id),\
                        (hmmer_unique_target_isoform), (unique_target_id_count)]
        output.write("\t".join(map(str, output_field))+"\n")


Writing the output file: M_capitata_, PF00560,    91, 91, 78
Writing the output file: P_verrucosa_, PF13855,    298, 298, 221
Writing the output file: M_efflorescens_, PF01582,    35, 35, 19
Writing the output file: Fungia_, PF00560,    79, 79, 56
Writing the output file: M_capitata_, PF00047,    415, 415, 225
No hits. Manualy assigning 0s
Writing the output file: O_faveolata_, PF18837,    0, 0, 0
Writing the output file: P_damicornis_, PF13516,    107, 107, 75
Writing the output file: P_damicornis_, PF01582,    24, 24, 14
Writing the output file: A_hyactintus_, PF12799,    287, 287, 108
Writing the output file: M_capitata_, PF13855,    191, 191, 158
Writing the output file: M_efflorescens_, PF13895,    511, 511, 170
Writing the output file: Aiptasia_, PF00047,    455, 455, 133
Writing the output file: A_hyactintus_, PF18837,    3, 3, 3
Writing the output file: Fungia_, PF01582,    32, 32, 17
Writing the output file: A_cytherea_, PF00560,    138, 138, 75
Writing the output file: P_verr