## HMMER file parsing and results counting
This code will parse and count the number of HMMER search results generated from the previous step and output it to a table.

In [1]:
#!/usr/bin/python
from collections import defaultdict
import pandas as pd
from os.path import exists, join
import pdb
import os

# Import files and setup directories for parsing files

In [2]:
input_hmmer_dir = "../output/hmmer_txt_output/"
input_dir = "../input/"

output_file = "hmmer_count_results_cnidarian.tsv"
output_dir = "../output/"
output_filepath = join(output_dir,output_file)

required = [input_dir,input_hmmer_dir,output_dir]

for r in required:
    if not exists(r):
        raise FileNotFoundError(f"The specified hmmer file {r} does not seem to be in the specified path.")
    print(f"Confirmed that required file or directory {r} exists.")


Confirmed that required file or directory ../input/ exists.
Confirmed that required file or directory ../output/hmmer_txt_output/ exists.
Confirmed that required file or directory ../output/ exists.


# Prepare a output file so that data generated from the loop in the next step can be written to it.
The column names in the file are: Species File, Domain, Count, and Unique Count. The count is the total number of hits Hmmer found wile unique count only takes into account different hits (ie no duplicate hits).

In [3]:
#prepare the output file for the loop by adding column names
column_names = ["Species_File", "Domain", "Count", "Unique_Count_Isoform", "Unique_count_Sequence"]
with open(output_filepath, 'w') as output:
    output.write("%s" % "\t".join(column_names)+"\n")

## Loop to count the number of unique outputs generated by HMMER 
Convert HMMER text results into a pandas data frame and then count the results. 
Note: HMMER uses space delimination

In [4]:
for results_file in os.listdir(input_hmmer_dir):
    if not results_file.endswith(".txt"):
        continue
    #convert the HMMER output to a pandas dataframe (separated by spaces)
    hmmer_results = pd.read_csv((input_hmmer_dir + results_file),sep = "\s+", \
                                comment = "#", index_col = False,\
            names=["target_name", "accession description", "query name", "accession",\
            "full sequence E-value", "full sequence score", "full sequence bias",\
            "best 1 domain E-value", "best domain score", "best domain bias",\
            "domain number estimation exp", "domain number estimation reg",\
            "domain number estimation clu", "domain number estimation ov",\
            "domain number estimation env", "domain number estimation dom",\
            "domain number estimation rep", "domain number estimation inc",\
            "description of target"])
    
    #select e value columns. Count the number of total and unique ids in the column
    hmmer_select_df = hmmer_results[hmmer_results["full sequence E-value"]<.1]
    
    #count the total number of hits by isolating the first value from the shape output
    hmmer_total_id = hmmer_select_df.shape[0]
    
    #count the uniques id hits including isoforms
    hmmer_unique_target_isoform = hmmer_select_df.target_name.nunique()
    
    #count unique target names only not number of isoforms
    unique_target_id = hmmer_select_df.target_name.str.split('.p').str[0]
    unique_target_id_count = unique_target_id.nunique()
    
    # extract column and row to designate domain used
    results_column_used = hmmer_results.loc[:,'query name']
    results_domain_used = results_column_used.iloc[0,]
    
    #extract only the file name (ie without the file extention) for the output file
    results_file_split = os.path.splitext(results_file) [0]
    
    #print what is being analyzed in the same stile that it will be put in the table
    print(f"Writing the output file: {results_file_split}, {results_domain_used},\
    {hmmer_select_df.shape}, {hmmer_unique_target_isoform}, {unique_target_id_count}")
    
    #write the results to a table 
    with open(output_filepath, 'a') as output:
        output_field = [(results_file_split), (results_domain_used), (hmmer_total_id),\
                        (hmmer_unique_target_isoform), (unique_target_id_count)]
        output.write("\t".join(map(str, output_field))+"\n")


Writing the output file: A_aurita_longest_orfs_hmmer_results, PF01582_full_up,    (12, 19), 12, 12
Writing the output file: Fungia_longest_orfs_hmmer_results, PF01582_full_up,    (34, 19), 34, 18
Writing the output file: P_verrucosa_longest_orfs_hmmer_results, PF01582_full_up,    (28, 19), 28, 17
Writing the output file: O_faveolata_longest_orfs_hmmer_results, PF01582_full_up,    (27, 19), 27, 15
Writing the output file: Aiptasia_longest_orfs_hmmer_results, PF01582_full_up,    (9, 19), 9, 8
Writing the output file: A_cytherea_longest_orfs_hmmer_results, PF01582_full_up,    (34, 19), 34, 18
Writing the output file: P_lutea_longest_orfs_hmmer_results, PF01582_full_up,    (38, 19), 38, 25
Writing the output file: Xen_longest_orfs_hmmer_results, PF01582_full_up,    (7, 19), 7, 5
Writing the output file: G_fascicularis_longest_orfs_hmmer_results, PF01582_full_up,    (20, 19), 20, 16
Writing the output file: P_damicornis_longest_orfs_hmmer_results, PF01582_full_up,    (23, 19), 23, 13
Writin