# Viterbi Processing

#### Check whether the log-Viterbi scores of the testing set fall within the range observed in the training set

In [1]:
## Input: both files are read below with SeqIO.parse(..., "fasta"); the Viterbi
## score and sequence name are extracted from each record's description line.
training_file = "./bonyFish_viterbi_training"  ## training Viterbi data from the HMM
testing_file = "./bonyFish_viterbi_testing"  ## testing Viterbi data from the HMM

## Output:
## Intermediate FASTA: testing records re-headed as "<name> <score>" (removed again by the last cell).
log_viterbi_testing_file = "./bonyFish_log_viterbi_testing.fasta"
## Final FASTA: the same records with " Yes" / " No" appended (score inside / outside the training range).
log_viterbi_testing_file_with_output = "./bonyFish_log_viterbi_testing_out.fasta"

## Support libraries

In [2]:
#libs
import os
from bisect import bisect_left

import pandas as pd
from Bio import AlignIO
from Bio import Phylo
from Bio import SeqIO
from sklearn.model_selection import train_test_split

## Training set

In [3]:
# Gather the description line of every FASTA record in the training file;
# the number of records is simply the length of that list.
with open(training_file) as handle:
    des = [record.description for record in SeqIO.parse(handle, "fasta")]
length_training = len(des)
print("Training size: ", length_training)

Training size:  20


In [4]:
# The score is the text between the first "=" in the description and the
# first "_" that follows it; convert that slice to float for each record.
scores_training = [
    float(description.split("=", 1)[1].split("_", 1)[0])
    for description in des
]

In [5]:
# Show the parsed training scores as the cell output for a quick visual check.
scores_training

[-73.6105520177382,
 -84.92690956533501,
 -217.30760033867622,
 -78.22266231879902,
 -418.70785531728654,
 -73.6105520177382,
 -287.0834628185588,
 -95.51999429893145,
 -92.63791276230968,
 -230.5730549350164,
 -82.02659071501006,
 -380.0310040573463,
 -392.4571684762799,
 -73.60694789190055,
 -107.71673284312624,
 -75.84125644116058,
 -416.04164316809056,
 -81.30668680819508,
 -78.49867154508875,
 -101.92132305956717]

In [6]:
# Establish the score range spanned by the training set; later cells compare
# every testing score against these two bounds.
minViterbi, maxViterbi = min(scores_training), max(scores_training)
print("Training sets viterbi")
print("Max Viterbi: ", maxViterbi)
print("Min Viterbi: ", minViterbi)

Training sets viterbi
Max Viterbi:  -73.60694789190055
Min Viterbi:  -418.70785531728654


## Testing set

In [7]:
# Gather the description line of every FASTA record in the testing file;
# the number of records is simply the length of that list.
with open(testing_file) as handle:
    des_testing = [record.description for record in SeqIO.parse(handle, "fasta")]
length_testing = len(des_testing)
print("Testing size: ", length_testing)

Testing size:  5


In [8]:
# Get score: the text between the first "=" of the description and the first
# "_" after it, converted to float, in the same order as des_testing.
scores_testing = [
    float(description.split("=", 1)[1].split("_", 1)[0])
    for description in des_testing
]

In [9]:
# Show the parsed testing scores as the cell output for a quick visual check.
scores_testing

[-462.58235850212895,
 -103.97955074178653,
 -78.78796399513632,
 -87.15685326113591,
 -83.9925180293402]

In [10]:
# Get name: the token that follows the first "__" in the description and
# ends at the next space, in the same order as des_testing.
name_testing = [
    description.split("__", 1)[1].split(" ", 1)[0]
    for description in des_testing
]
name_testing

['NP_001186299.1',
 'NP_001243561.1',
 'XP_033482864.1',
 'XP_030277370.1',
 'XP_028431436.1']

In [11]:
# Rewrite the testing records with a compact header: the record id becomes the
# parsed sequence name and the description becomes the parsed Viterbi score, so
# each header in the output FASTA reads "<name> <score>".
original_file = testing_file
corrected_file = log_viterbi_testing_file
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above instead of reopening the path a second time.
    records = SeqIO.parse(original, 'fasta')
    # scores_testing and name_testing were both built from des_testing, i.e. in
    # record order, so zip pairs each record with its own score and name.
    # zip stops at the shortest input.
    for (record,score,name) in zip (records,scores_testing,name_testing):
        record.id = name
        record.description = str(score)
        SeqIO.write(record, corrected, 'fasta')

### Check whether each testing Viterbi score is within the training range

In [12]:
#helper function for viterbi check
def check_viterbi(input,min,max):
    """Print whether a Viterbi score lies inside the closed range [min, max].

    Parameters
    ----------
    input : the score to check.
    min : lower bound of the accepted range (inclusive).
    max : upper bound of the accepted range (inclusive).

    Returns
    -------
    None. The verdict is reported with ``print`` only.

    Note: the parameter names shadow Python builtins inside this function;
    they are kept unchanged so existing callers are unaffected.
    """
    if (input<min):
        print("Less than min, not in range")
    elif (input>max):
        print("More than max, not in range")
    else:
        print("Within the Training Viterbi range")

In [13]:
#Check the viterbi if it is in the range
print("Viterbi range: min:", minViterbi, " to max: ", maxViterbi)
with open(log_viterbi_testing_file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        print("Sequence check: ", record.id)
        # The description read back from this file is "<id> <score>", so the
        # score is everything after the first space.
        split_string = record.description.split(" ", 1)
        viterbi_score = float(split_string[1])
        print("Sequence viterbi score: ", viterbi_score)
        check_viterbi(viterbi_score,minViterbi,maxViterbi)
        # des_testing is intentionally not modified here: appending these
        # "<id> <score>" descriptions (which contain no "=" or "__" field)
        # would make the score/name parsing cells fail if they were re-run.

Viterbi range: min: -418.70785531728654  to max:  -73.60694789190055
Sequence check:  NP_001186299.1
Sequence viterbi score:  -462.58235850212895
Less than min, not in range
Sequence check:  NP_001243561.1
Sequence viterbi score:  -103.97955074178653
Within the Training Viterbi range
Sequence check:  XP_033482864.1
Sequence viterbi score:  -78.78796399513632
Within the Training Viterbi range
Sequence check:  XP_030277370.1
Sequence viterbi score:  -87.15685326113591
Within the Training Viterbi range
Sequence check:  XP_028431436.1
Sequence viterbi score:  -83.9925180293402
Within the Training Viterbi range


### Flag each testing sequence by whether its Viterbi score falls within the Viterbi range of the training (input) file

In [14]:
#helper function for viterbi check
def check_viterbi_output(input,min,max):
    """Return True when ``input`` is neither below ``min`` nor above ``max``.

    Both bounds are inclusive: a score equal to ``min`` or ``max`` counts as
    inside the range. Any score that is below ``min`` or above ``max`` yields
    False.
    """
    return not (input < min or input > max)

In [22]:
# Write the final FASTA: every testing record gets " Yes" appended to its
# description when its score is inside the training Viterbi range and " No"
# when it is outside; each resulting description is also printed.
original_file = log_viterbi_testing_file
corrected_file = log_viterbi_testing_file_with_output
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above instead of reopening the path a second time.
    for record in SeqIO.parse(original, 'fasta'):
        # The description is "<id> <score>"; the score follows the first space.
        viterbi_score = float(record.description.split(" ", 1)[1])
        verdict = "Yes" if check_viterbi_output(viterbi_score,minViterbi,maxViterbi) else "No"
        record.description = record.description + " " + verdict
        print(record.description)
        SeqIO.write(record, corrected, 'fasta')

NP_001186299.1 -462.58235850212895 No
NP_001243561.1 -103.97955074178653 Yes
XP_033482864.1 -78.78796399513632 Yes
XP_030277370.1 -87.15685326113591 Yes
XP_028431436.1 -83.9925180293402 Yes


In [23]:
## Remove the intermediate FASTA now that the final output file has been written.
## Comment this cell out to keep the intermediate file for inspection.
# Guarded so that re-running the cell after the file is already gone does not raise.
if os.path.exists(log_viterbi_testing_file):
    os.remove(log_viterbi_testing_file)