## This notebook shows the steps needed to find TIR-domain-containing proteins in the *Acropora digitifera* and *Nematostella* genomes

In [5]:
# Step 1: fetch the input data.
import wget

# Protein file from the Compagen website, the source used in Poole and Weis 2014.
# NOTE: this download has been reported as not working.
url = "http://compagen.zoologie.uni-kiel.de/datasets/ADIG_G-PEP_111201"
input_path = "../input"

print("Downloading files from the internet")
# wget.download is given the input folder as its destination; its return value is kept in ADIG_PEP.
ADIG_PEP = wget.download(url, out=input_path)

Downloading files from the internet


## Preprocessing the data!
The downloaded files may need to be unzipped first.
It is also important that all sequence letters are upper case.

In [None]:
# Optional step: some of the genomes are not zipped when downloaded, in which case this cell can be skipped.
# NOTE(review): "file_name" appears to be a placeholder — replace it with the actual archive name in ../input.
!unzip "../input/file_name"


In [7]:
# Make sure all sequence letters are uppercase while leaving the header lines untouched:
#   - lines starting with ">" are printed unchanged;
#   - every other non-empty line is printed through toupper().
# The result is redirected to ../input/proteomes/adig_pep_upper.fna.
# NOTE(review): the hmmsearch cell further down reads *.fna files from .../input/genomes/ —
# confirm that the uppercased file ends up in the folder that cell searches.
!awk '/^>/ {print($0)}; /^[^>]/{print(toupper($0))}' "../input/ADIG_G-PEP_111201" > ../input/proteomes/adig_pep_upper.fna

## Build the HMMER TIR profile

These steps require that HMMER be installed on your computer.

In [5]:
#!/bash
import os
pfam_domain_dir = "../input/domains"
output_dir = "../output"
for domain in os.listdir(pfam_domain_dir):
    if not domain.endswith(".sto"):
        continue
    pfam_domain_filepath = os.path.join(pfam_domain_dir, domain)
    print("Building a HMM profile from: {pfam_domain_filepath}")
    #need to split the file name so that I can get a file without the extention
    base_filename = os.path.splitext(domain)[0]
    #build the hmmer profile for each domain in the folder and save the file
    output_filepath = os.path.join(output_dir, f"{base_filename}_profile.hmm")
    !echo "Building HMM profile: {output_filepath}"
    !hmmbuild {output_filepath} {pfam_domain_filepath}

Building a HMM profile from: {pfam_domain_filepath}
Building HMM profile: ../output/PF01582_full_up_profile.hmm
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             ../input/domains/PF01582_full_up.sto
# output HMM file:                  ../output/PF01582_full_up_profile.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     PF01582_full_up      13614  1188   299    50.60  0.590 

# CPU time: 1.07u 0.05s 00:00:01.12 Elapsed: 00:00:01.36


# Search the translated coral genomes for TIR domains
Use the TIR profile built above to search the translated genomes. Translated (protein) sequences are required because the profile built in the previous step is amino-acid based.

In [2]:
import os
#search proteome for TIR domains and save them
genome_dir = "/home/manager/TIR_A_dig/input/genomes/"
hmm_dir = "../output/"
output_dir ="../output"
#coral_pep = os.listdir(data_dir)
for hmm_file in os.listdir(hmm_dir):
    if not hmm_file.endswith(".hmm"):
            continue
    hmm_filepath = os.path.join(hmm_dir,hmm_file)
 
    for genome_file in os.listdir(genome_dir):
        if not genome_file.endswith(".fna"):
            continue
        coral_pep_filepath=os.path.join(genome_dir,genome_file)
        
        print(f"Running HMM search on genome {coral_pep_filepath}")
        print(f"Running HMM search using HMM file {hmm_filepath}")
       
        coral_pep_split = os.path.splitext(genome_file)[0]
        
        output_file = f"{coral_pep_split}_hmmer_results.txt"
        output_filepath = os.path.join(output_dir, output_file)
        print(f"Generating output file path {output_filepath}")
        !hmmsearch --tblout {output_filepath} {hmm_filepath} {coral_pep_filepath}
    #it works yay! but looks like it runs through the loop 4 times rather than just 2. 
    #Also for some reason tblout doesn't work.
       
#!hmmsearch ../output/PF01582_full_up.hmm  ../output/adig_pep_upper.fna > ../output/PF01582_full_up.out

Running HMM search on genome /home/manager/TIR_A_dig/input/genomes/adig_pep_upper.fna
Running HMM search using HMM file ../output/PF01582_full_up_profile.hmm
Generating output file path ../output/adig_pep_upper_hmmer_results.txt
# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query HMM file:                  ../output/PF01582_full_up_profile.hmm
# target sequence database:        /home/manager/TIR_A_dig/input/genomes/adig_pep_upper.fna
# per-seq hits tabular output:     ../output/adig_pep_upper_hmmer_results.txt
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       PF01582_full_up  [M=299]
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-v

In [2]:
!hmmsearch -h

# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Usage: hmmsearch [options] <hmmfile> <seqdb>

Basic options:
  -h : show brief help on version and usage

Options directing output:
  -o <f>           : direct output to file <f>, not stdout
  -A <f>           : save multiple alignment of all hits to file <f>
  --tblout <f>     : save parseable table of per-sequence hits to file <f>
  --domtblout <f>  : save parseable table of per-domain hits to file <f>
  --pfamtblout <f> : save table of hits and domains to file, in Pfam format <f>
  --acc            : prefer accessions over names in output
  --noali          : don't output alignments, so output is smaller
  --notextw        : unlimit ASCII text output line width
  --textw <n>      : set max width of 