# Viterbi Processing

#### Check whether the log-Viterbi scores of the testing set fall within the range observed in the training set

In [1]:
## Input: FASTA files read by this notebook
training_file = "./OXTR_viterbi_training"  # training Viterbi data from the HMM
testing_file = "./OXTR_viterbi_testing"  # testing Viterbi data from the HMM

## Output: FASTA files written by this notebook
# testing records re-labelled with their parsed Viterbi score
log_viterbi_testing_file = "./OXTR_log_viterbi_testing.fasta"
# same records with the in-range result appended to the description
log_viterbi_testing_file_with_output = "./primate_log_viterbi_testing_out.fasta"

## Support libraries

In [2]:
#libs
import pandas as pd
from Bio import AlignIO
from Bio import SeqIO
from Bio import Phylo
from sklearn.model_selection import train_test_split
from bisect import bisect_left

## Training set

In [3]:
# Collect the description line of every FASTA record in the training file
with open(training_file) as handle:
    des = [record.description for record in SeqIO.parse(handle, "fasta")]
length_training = len(des)
print("Training size: ", length_training)

Training size:  20


In [4]:
# The score is the text between the first "=" and the next "_" of each description
scores_training = [
    float(description.split("=", 1)[1].split("_", 1)[0])
    for description in des
]

In [5]:
# Display the parsed training scores
scores_training

[-13.505550165546241,
 -10.175748823247558,
 -15.890452753421053,
 -11.454502412279107,
 -15.753575067847407,
 -11.074291153310236,
 -11.175748816776066,
 -10.647012991819068,
 -11.852442414479665,
 -10.522074237234412,
 -10.175748823247558,
 -10.175748823247558,
 -11.925766580850727,
 -24.34832377595867,
 -15.179186461109301,
 -18.800142442405008,
 -13.281982449350739,
 -16.08936715455976,
 -15.745888255211144,
 -17.169206342452558]

In [6]:
# Find the min and max Viterbi score of the training set; together they form
# the reference range that the testing scores are compared against.
maxViterbi = max(scores_training)
minViterbi = min(scores_training)
print("Training sets viterbi")
print("Max Viterbi: ", maxViterbi)
print("Min Viterbi: ", minViterbi)

Training sets viterbi
Max Viterbi:  -10.175748823247558
Min Viterbi:  -24.34832377595867


## Testing set

In [7]:
# Collect the description line of every FASTA record in the testing file
with open(testing_file) as handle:
    des_testing = [record.description for record in SeqIO.parse(handle, "fasta")]
length_testing = len(des_testing)
print("Testing size: ", length_testing)

Testing size:  5


In [8]:
# Get score: the text between the first "=" and the next "_" of each description
scores_testing = [
    float(description.split("=", 1)[1].split("_", 1)[0])
    for description in des_testing
]

In [9]:
# Display the parsed testing scores
scores_testing

[-19.81947906928411,
 -19.027963863397595,
 -10.522074237234412,
 -15.902729537845085,
 -17.95769928266878]

In [10]:
# Get name: the text after the first "__", cut at the first space
name_testing = [
    description.split("__", 1)[1].split(" ", 1)[0]
    for description in des_testing
]
name_testing

['XP_003927162.1',
 'XP_017357089.1',
 'XP_032005459.1',
 'XP_002813528.1',
 'ALO75878.1']

In [11]:
# Re-write the testing records as FASTA, using the parsed name as the record id
# and the parsed Viterbi score as the record description.
original_file = testing_file
corrected_file = log_viterbi_testing_file
# Every record needs exactly one score and one name; zip() would otherwise
# silently drop whatever does not pair up.
assert len(scores_testing) == len(name_testing), "scores/names length mismatch"
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above rather than opening the path a second time.
    records = SeqIO.parse(original, 'fasta')
    for record, score, name in zip(records, scores_testing, name_testing):
        record.id = name
        record.description = str(score)
        SeqIO.write(record, corrected, 'fasta')

### Check whether each testing Viterbi score is within the training range

In [12]:
#helper function for viterbi check
def check_viterbi(input,min,max):
    """Print whether a Viterbi score lies within the range [min, max].

    Args:
        input: score to check.
        min: lower bound of the range (inclusive).
        max: upper bound of the range (inclusive).

    The parameter names shadow Python builtins inside this function; they are
    kept unchanged so existing positional and keyword callers keep working.
    """
    if input < min:
        print("Less than min, not in range")
    elif input > max:
        print("More than max, not in range")
    else:
        print("Within the Training Viterbi range")

In [13]:
# Check whether each testing Viterbi score lies within the training range
print("Viterbi range: min:", minViterbi, " to max: ", maxViterbi)
with open(log_viterbi_testing_file) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        print("Sequence check: ", record.id)
        # The description read back is "<id> <score>"; the score is the second field.
        viterbi_score = float(record.description.split(" ", 1)[1])
        print("Sequence viterbi score: ", viterbi_score)
        check_viterbi(viterbi_score, minViterbi, maxViterbi)
        # des_testing is deliberately left untouched here: it holds the raw
        # descriptions that the score/name parsing cells split on "=" and "__",
        # and "<id> <score>" strings would make those cells fail on a re-run.

Viterbi range: min: -24.34832377595867  to max:  -10.175748823247558
Sequence check:  XP_003927162.1
Sequence viterbi score:  -19.81947906928411
Within the Training Viterbi range
Sequence check:  XP_017357089.1
Sequence viterbi score:  -19.027963863397595
Within the Training Viterbi range
Sequence check:  XP_032005459.1
Sequence viterbi score:  -10.522074237234412
Within the Training Viterbi range
Sequence check:  XP_002813528.1
Sequence viterbi score:  -15.902729537845085
Within the Training Viterbi range
Sequence check:  ALO75878.1
Sequence viterbi score:  -17.95769928266878
Within the Training Viterbi range


### Check whether each testing Viterbi score falls within the training Viterbi range, and write the result to the output file

In [14]:
#helper function for viterbi check
def check_viterbi_output(input,min,max):
    """Return False when the score is below `min` or above `max`, else True."""
    is_outside = input < min or input > max
    return not is_outside

In [15]:
# Append a Yes/No in-range verdict to each testing record and write the result
original_file = log_viterbi_testing_file
corrected_file = log_viterbi_testing_file_with_output
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle opened above rather than opening the path a second time.
    for record in SeqIO.parse(original, 'fasta'):
        # The description read back is "<id> <score>"; the score is the second field.
        viterbi_score = float(record.description.split(" ", 1)[1])
        in_range = check_viterbi_output(viterbi_score, minViterbi, maxViterbi)
        record.description = record.description + (" Yes" if in_range else " No")
        print(record.description)
        SeqIO.write(record, corrected, 'fasta')

XP_003927162.1 -19.81947906928411 Yes
XP_017357089.1 -19.027963863397595 Yes
XP_032005459.1 -10.522074237234412 Yes
XP_002813528.1 -15.902729537845085 Yes
ALO75878.1 -17.95769928266878 Yes
