In [1]:
from random import choice
from random import randint 
from math import log10
def fasta_reader(file):
    return [line.strip("\n").split(",") for line in open(file)]
#generate a random character given weights 
def select_char(probabilities:dict, precision: int = 10000):
    s = ""
    for i in probabilities.keys():
        s+=i*int(probabilities[i]*precision)
    return choice(s)

class HMM:
    def __init__(self):
        self.states = []
        self.emissions = []
        self.data_table = {}
        self.columns = [] #list of dicts each list position is a 
    def load_data(self, file, state_slice=0, seq_slice=2):
        for data_point in fasta_reader(file):
            state = data_point[state_slice]
            seq = data_point[seq_slice].strip(" ")
            if state not in self.states:
                self.states.append(state)
            for i in seq:
                if i not in self.emissions and i != " ":
                    self.emissions.append(i)
            self.data_table[seq] = state 
    def build_model(self):
        self.columns = {z:[{j:1 for j in self.emissions} for i in range(len(list(self.data_table.keys())[0]))] for z in self.states}
        self.prob_columns = {z:[{j:1 for j in self.emissions} for i in range(len(list(self.data_table.keys())[0]))] for z in self.states}
        for z in self.columns.keys(): # for each state 
            for j in range(len(list(self.data_table.keys())[0])): # for each position in the list of columns 
                for i in self.data_table.keys(): #check to make sure the sequence being used belongs to the right state
                    if self.data_table[i] == z:
                        char = i[j]
                        self.columns[z][j][char] = self.columns[z][j][char] + 1   
            for j in range(len(list(self.data_table.keys())[0])):
                counter = 0
                for i in self.columns[z][j].keys():
                    counter+=self.columns[z][j][i]
                for i in self.prob_columns[z][j].keys():
                    self.prob_columns[z][j][i] = self.columns[z][j][i]/counter
                    
    def viterbi_classification(self, seq, logodd=1, verbose=0):
        state_seq = []
        prob = {i:1 for i in self.states}
        if logodd == 0:
            def ld(x, y):
                return x*y
        else:
            def ld(x, y):
                if x == 1:
                    return log10(y)
                else:
                    return x+log10(y)
        for i in range(len(seq)):
            for j in self.prob_columns.keys():
                prob[j] = ld(prob[j], self.prob_columns[j][i][seq[i]])
        highest = -100000000
        state = ""
        if verbose !=0:
            print(prob)
        for i in prob.keys():
            if prob[i] > highest:
                highest = prob[i]
                state = i
        return state, highest
      
    def generate_sequence(self, state="random"):
        if state == "random":
            state = choice(self.states)
        seq = ""
        for i in range(len(self.prob_columns[state])):
            seq+=select_char(self.prob_columns[state][i])
        return seq, state
            
x = HMM()
x.load_data("splice.data.txt")
x.build_model()
x.prob_columns
x.viterbi_classification("AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCCCTCCGTGCCCCCGC")
x.generate_sequence()

print(x.states, x.emissions)
x.viterbi_classification("CTGGAATTACCAGCTATTCCTCTTATGACTAGGTCTTTACCACCGCGCGGCTCTTCTATT") #expected state N

['EI', 'IE', 'N'] ['C', 'A', 'G', 'T', 'N', 'D', 'R', 'S']


('N', -36.415347866411324)

In [2]:
def sanity_check(states: list, hmm: HMM, labeled_data: dict, verbose = 0):
    answers = {i:{"True Negative":0, "True Positive":0, "False Positive":0, "False Negative":0} for i in states}
    for i in labeled_data.keys():
        results = hmm.viterbi_classification(i)[0]
        if results == labeled_data[i]:
            for j in answers.keys():
                if j == results: 
                    answers[j]["True Positive"] = answers[j]["True Positive"]+1
                else:
                    answers[j]["True Negative"] = answers[j]["True Negative"]+1
        else:
            for j in answers.keys():
                if j == results: 
                    answers[j]["False Positive"] = answers[j]["False Positive"]+1
                    answers[labeled_data[i]]["False Negative"] = answers[labeled_data[i]]["False Negative"]+1
                    
                else:
                    answers[j]["True Negative"] = answers[j]["True Negative"]+1  
    if verbose != 0: 
        for i in answers.keys():
            accuracy = (answers[i]["True Negative"]+answers[i]["True Positive"])/(answers[i]["True Negative"]+answers[i]["True Positive"]+answers[i]["False Negative"]+answers[i]["False Negative"])
            spec = answers[i]["True Negative"]/(answers[i]["True Negative"] + answers[i]["False Positive"])
            sens = answers[i]["True Positive"]/(answers[i]["True Positive"] + answers[i]["False Negative"])
            print("--",i,"--")
            print("Accuracy: ", accuracy)
            print("Specificity", spec)
            print("Sensitivty", sens)
            print("\n")
    return answers
print("Self Test")
sanity_check(x.states, x, x.data_table, verbose = 1)

Self Test
-- EI --
Accuracy:  0.975504799735187
Specificity 0.9754549301735083
Sensitivty 0.9455081001472754


-- IE --
Accuracy:  0.9774685222001326
Specificity 0.9767638360794254
Sensitivty 0.9494047619047619


-- N --
Accuracy:  0.9513288504642972
Specificity 0.9761737911702874
Sensitivty 0.9540507859733979




{'EI': {'False Negative': 37,
  'False Positive': 58,
  'True Negative': 2305,
  'True Positive': 642},
 'IE': {'False Negative': 34,
  'False Positive': 55,
  'True Negative': 2312,
  'True Positive': 638},
 'N': {'False Negative': 76,
  'False Positive': 34,
  'True Negative': 1393,
  'True Positive': 1578}}

In [6]:
test_data = {}
count = 100000
for i in range(count):
    seq, state = x.generate_sequence()
    test_data[seq]=state
print("Test on",count,"Points of Randomly Generated Data")
sanity_check(x.states, x, test_data, verbose = 1)

Test on 100000 Points of Randomly Generated Data
-- EI --
Accuracy:  0.9854005875616193
Specificity 0.9843981948815391
Sensitivty 0.9780335041505589


-- IE --
Accuracy:  0.9878726327737619
Specificity 0.9884326541452115
Sensitivty 0.9814510233918129


-- N --
Accuracy:  0.9741670595630991
Specificity 0.9878251081225191
Sensitivty 0.9610935234770259




{'EI': {'False Negative': 733,
  'False Positive': 1051,
  'True Negative': 66313,
  'True Positive': 32636},
 'IE': {'False Negative': 609,
  'False Positive': 784,
  'True Negative': 66993,
  'True Positive': 32223},
 'N': {'False Negative': 1315,
  'False Positive': 822,
  'True Negative': 66694,
  'True Positive': 32484}}