In [15]:
import numpy as np
from scipy.special import logsumexp
import re

def trans_to_nuc_abunds(trans_abunds_vec, trans_lengths_vec):
    """Convert transcript abundances into nucleotide abundances.

    Every transcript abundance is weighted by the length of its transcript, and the
    weighted values are then rescaled so that they sum to 1.

    Args:
        trans_abunds_vec: transcript abundances, one entry per transcript.
        trans_lengths_vec: transcript lengths, in the same order.

    Returns:
        An array of normalized nucleotide abundances, or False when the two inputs do
        not have the same number of entries.
    """
    if len(trans_abunds_vec) == len(trans_lengths_vec):
        # Abundance x length is the (unnormalized) share of nucleotides per transcript
        length_weighted = np.multiply(trans_abunds_vec, trans_lengths_vec)
        return np.true_divide(length_weighted, sum(length_weighted))

    # Mismatched inputs are reported with a sentinel rather than an exception
    return False

def nuc_to_trans_abunds(nuc_abunds_vec, trans_lengths_vec):
    """Convert nucleotide abundances into transcript abundances.

    Every nucleotide abundance is divided by the length of its transcript, and the
    resulting values are then rescaled so that they sum to 1.

    Args:
        nuc_abunds_vec: nucleotide abundances, one entry per transcript.
        trans_lengths_vec: transcript lengths, in the same order.

    Returns:
        An array of normalized transcript abundances, or False when the two inputs do
        not have the same number of entries.
    """
    if len(nuc_abunds_vec) == len(trans_lengths_vec):
        # Dividing by length undoes the length weighting of nucleotide abundances
        per_length = np.true_divide(nuc_abunds_vec, trans_lengths_vec)
        return np.true_divide(per_length, sum(per_length))

    # Mismatched inputs are reported with a sentinel rather than an exception
    return False

def generate_rand_abunds_and_lens(trans_num, len_range):
    """Draw random transcript abundances and random transcript lengths.

    The abundances are a single draw from a Dirichlet distribution whose concentration
    parameters are all 1. The lengths are integers drawn from the inclusive range
    described by len_range.

    Args:
        trans_num: number of transcripts; both returned lists have this many entries.
        len_range: (shortest, longest) transcript length, both ends inclusive.

    Returns:
        A tuple (abundances, lengths) of two lists.
    """
    concentration = np.ones(trans_num)
    abundance_draw = np.random.dirichlet(concentration, size=1)[0]

    shortest = len_range[0]
    longest = len_range[1]
    # randint excludes its upper bound, so add 1 to make the longest length attainable
    length_draw = np.random.randint(shortest, longest + 1, trans_num)

    return list(abundance_draw), list(length_draw)

def create_arc(trans_lens_vec):
    """Lay transcripts out on a circular locus and list the segments each one covers.

    The locus has as many segments as there are transcripts. Transcript i starts at
    segment i and covers trans_lens_vec[i] consecutive segments, wrapping around to
    segment 0 when it runs past the last segment.

    Args:
        trans_lens_vec: length (in segments) of each transcript.

    Returns:
        A list with one entry per transcript; each entry is the list of segment indices
        that transcript covers, in order.
    """
    locus_size = len(trans_lens_vec)
    segments = list(range(locus_size))

    def covered_segments(first, length):
        last = first + length
        if last <= locus_size:
            return segments[first:last]
        # Ran off the end of the locus: continue from segment 0
        return segments[first:] + segments[:last - locus_size]

    return [covered_segments(first, length) for first, length in enumerate(trans_lens_vec)]

def create_reads(trans_vec, trans_abunds_vec, trans_lens_vec, N):
    """Simulate N reads from a set of transcripts.

    For every read, a transcript is picked with probability equal to its nucleotide
    abundance, and then one of that transcript's segments is picked at random to be the
    read.

    Args:
        trans_vec: list of transcripts, each a list of segment indices.
        trans_abunds_vec: transcript abundances, one per transcript.
        trans_lens_vec: transcript lengths, one per transcript.
        N: number of reads to generate.

    Returns:
        A tuple (reads, sources): the sampled segment for every read, and the index of
        the transcript each read was sampled from.
    """
    n_transcripts = len(trans_vec)
    # Longer transcripts contribute more nucleotides, so sample by nucleotide abundance
    pick_probs = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)

    sampled_segments = []
    sampled_sources = []
    for _ in range(N):
        source = np.random.choice(n_transcripts, p=pick_probs)
        sampled_sources.append(source)
        sampled_segments.append(np.random.choice(trans_vec[source]))

    return sampled_segments, sampled_sources

In [2]:
# Simulation settings, named so the experiment can be re-tuned in one place
NUM_TRANSCRIPTS = 10      # number of transcripts to simulate
LENGTH_RANGE = (2, 4)     # inclusive (shortest, longest) transcript length
NUM_READS = 100000        # number of reads to simulate
RANDOM_SEED = 42          # fixed so that Restart & Run All regenerates the same data

# Seed numpy's global generator: every sampler below draws from it
np.random.seed(RANDOM_SEED)

# Random ground truth -> transcript layout -> simulated reads
trans_abunds_list, trans_lens_list = generate_rand_abunds_and_lens(NUM_TRANSCRIPTS, LENGTH_RANGE)
transcript_list = create_arc(trans_lens_list)
reads_list, trans_list = create_reads(transcript_list, trans_abunds_list, trans_lens_list, NUM_READS)

In [3]:
#TODO either the math or the code for this is wrong. I think the problem might even be in the creation of the reads
# from the transcriptome (check this first)

# Update: I have made sure the data is being generated properly, I have looked over the math, and I have tested
# to make sure the code is doing what the math says to do. I am really stumped as to why this is not working...
def calc_nll(reads_vec, trans_vec, trans_abunds_vec, trans_lens_vec):
    """Negative log-likelihood of the reads given a set of transcript abundances.

    The probability of one read is the sum, over every transcript that contains the
    read's segment, of (nucleotide abundance of the transcript) / (transcript length).
    The per-read log-probabilities are summed and negated.

    Args:
        reads_vec: observed reads (segment identifiers).
        trans_vec: list of transcripts, each a list of segment identifiers.
        trans_abunds_vec: transcript abundances, one per transcript.
        trans_lens_vec: transcript lengths, one per transcript.

    Returns:
        The total negative log-likelihood of all reads.
    """
    nuc_abunds = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)

    def read_log_prob(read):
        # Only transcripts containing the read contribute a non-zero probability, so
        # the sum over transcripts and segments reduces to these terms
        log_terms = [
            np.log(nuc_abunds[t] / trans_lens_vec[t])
            for t, transcript in enumerate(trans_vec)
            if read in transcript
        ]
        # Sum the contributions in log space
        return logsumexp(log_terms)

    per_read_log_probs = [read_log_prob(read) for read in reads_vec]
    return -1 * np.sum(per_read_log_probs)

def get_lestrade_ests(reads_vec, trans_vec, trans_lens_vec):
    """Estimate transcript abundances by splitting each read equally among its candidates.

    Every read contributes a total count of 1, divided evenly between all transcripts
    that contain the read's segment. The accumulated counts are normalized into
    nucleotide abundances and then converted to transcript abundances.

    Args:
        reads_vec: observed reads (segment identifiers).
        trans_vec: list of transcripts, each a list of segment identifiers.
        trans_lens_vec: transcript lengths, one per transcript.

    Returns:
        The estimated transcript abundances.
    """
    n_transcripts = len(trans_lens_vec)

    def split_one_read(read):
        # One slot per transcript; mark every transcript whose segments include the read
        votes = [0] * n_transcripts
        for t, transcript in enumerate(trans_vec):
            if read in transcript:
                votes[t] = 1
        # Share the single read equally among the marked transcripts
        return np.true_divide(votes, sum(votes))

    running_counts = [0] * n_transcripts
    for read in reads_vec:
        running_counts = np.add(running_counts, split_one_read(read))

    # Counts -> nucleotide abundances -> transcript abundances
    nuc_estimates = np.true_divide(running_counts, sum(running_counts))
    return nuc_to_trans_abunds(nuc_estimates, trans_lens_vec)
        

In [4]:
# Score the abundances that generated the data; this does not depend on any estimate
true_nll = calc_nll(reads_list, transcript_list, trans_abunds_list, trans_lens_list)

# Estimate abundances with the equal-split heuristic and score them on the same reads
lestrade_trans_abunds_list = get_lestrade_ests(reads_list, transcript_list, trans_lens_list)
lestrade_nll = calc_nll(
    reads_list, transcript_list, lestrade_trans_abunds_list, trans_lens_list
)

In [14]:
# Abundances: truth, estimate, and their element-wise absolute difference.
# Each call prints a heading and its value on consecutive lines.
print("True transcript abundances:", trans_abunds_list, sep="\n")
print("\nLestrade's transcript abundances:", lestrade_trans_abunds_list, sep="\n")
print(
    "\nDifference:",
    np.abs(np.subtract(trans_abunds_list, lestrade_trans_abunds_list)),
    sep="\n",
)

# Negative log-likelihoods of both parameter sets and the gap between them
print("\n\nNegative log-likelyhood of true parameters:", true_nll, sep="\n")
print("\nNegative log-likelyhood of Lestrade's parameters:", lestrade_nll, sep="\n")
print("\nNLL difference: ", np.abs(true_nll - lestrade_nll), sep="\n")

True transcript abundances:
[0.24093657295034118, 0.05221239146209306, 0.03669953919523882, 0.013589459121737311, 0.05524833772037555, 0.16772870154821895, 0.03136687490868616, 0.0021691374719922376, 0.30349391939155057, 0.09655506622976628]

Lestrade's transcript abundances:
[0.14099364 0.09789682 0.06735321 0.0351291  0.06981442 0.08047829
 0.07654297 0.13066893 0.1515792  0.14954342]

Difference:
[0.09994293 0.04568443 0.03065367 0.02153964 0.01456608 0.08725041
 0.0451761  0.12849979 0.15191472 0.05298835]


Negative log-likelyhood of true parameters:
217797.0855776609

Negative log-likelyhood of Lestrade's parameters:
220466.06129170262

NLL difference: 
2668.975714041706


In [26]:
def read_data_from_file(file_name):
    """TAKEN FROM LESTRADE AND ADAPTED FOR OUR PURPOSES. Reads a data table of the format output by Lestrade and
    returns the read counts and the transcript lengths.

    Expected layout of the file:
        The <T> transcripts ...              (header line)
        <name> <tau> <L> <structure>         (T lines; tau may be obscured as "xxxxx")
        (one separator line)
        The <N> read sequences ...           (header line)
        <read> <count>                       (T lines)

    The tau column and the total N are not needed here and are ignored.

    Args:
        file_name: path of the data file to read.

    Returns:
        A tuple (read_counts_list, L): the list of per-read counts, and an integer array
        with the length of each transcript.

    Raises:
        ValueError: if a header line or a table row does not have the expected format.
    """
    with open(file_name) as f:
        #   The first line is "The <n> transcripts of the sand mouse Arc locus"
        line = f.readline()
        match = re.search(r'^The (\d+) transcripts', line)
        if match is None:
            raise ValueError("expected 'The <n> transcripts ...' header, got: %r" % line)
        T = int(match.group(1))

        # The next T lines are
        #   <Arcn>  <true_tau> <L> <structure>
        # Only the length column is kept.
        L = np.zeros(T, dtype=int)
        for i in range(T):
            line = f.readline()
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("malformed transcript line %d: %r" % (i + 1, line))
            L[i] = int(fields[2])

        # after a blank line,
        # 'The <n> read sequences':
        f.readline()
        line = f.readline()
        if re.search(r'The (\d+) read sequences', line) is None:
            raise ValueError("expected 'The <n> read sequences' header, got: %r" % line)

        # the next T lines are
        #  <read a-j> <count>
        r = np.zeros(T, dtype=int)
        for k in range(T):
            line = f.readline()
            fields = line.split()
            if len(fields) < 2:
                raise ValueError("malformed read-count line %d: %r" % (k + 1, line))
            r[k] = int(fields[1])

    read_counts_list = list(r)

    return read_counts_list, L

def expectation_step(reads_vec, trans_vec, trans_abunds_vec, trans_lens_vec):
    """E-step: fractionally assign every read to the transcripts that could have produced it.

    For a single read, the posterior probability of transcript t is proportional to
    (nucleotide abundance of t) / (length of t) for every transcript that contains the
    read's segment, and zero for all other transcripts. These posteriors sum to 1 per
    read and are accumulated into one expected count per transcript.

    Args:
        reads_vec: observed reads (segment identifiers).
        trans_vec: list of transcripts, each a list of segment identifiers.
        trans_abunds_vec: current transcript abundance estimates, one per transcript.
        trans_lens_vec: transcript lengths, one per transcript.

    Returns:
        An array with the expected number of reads assigned to each transcript. Reads
        that are contained in no transcript cannot be assigned and are skipped.
    """
    # The read probability is written in terms of nucleotide abundances, so the
    # transcript abundances must be converted transcript -> nucleotide
    nuc_abunds_list = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)
    trans_num = len(trans_lens_vec)
    count_list = np.zeros(trans_num)

    for read in reads_vec:
        read_prob_list = []
        idx_list = []
        for idx, transcript in enumerate(trans_vec):
            if read in transcript:
                idx_list.append(idx)
                numer = np.log(nuc_abunds_list[idx] / trans_lens_vec[idx])
                read_prob_list.append(numer)

        # No transcript can explain this read, so there is nothing to normalize
        if not idx_list:
            continue

        # Normalize over ALL candidate transcripts for this read (in log space), then
        # exponentiate so the posteriors for this read add up to 1
        denom = logsumexp(read_prob_list)
        posterior_list = np.exp(np.subtract(read_prob_list, denom))

        # Each transcript index appears at most once per read, so this adds each
        # posterior to its own transcript's running count
        count_list[idx_list] += posterior_list

    return count_list
            
            

In [28]:
# Load the read counts and transcript lengths from the Lestrade-format data file.
# NOTE: despite its name, nuc_abunds_from_counts_list holds the raw read counts;
# read_data_from_file returns them without normalizing.
nuc_abunds_from_counts_list, len_list = read_data_from_file("w08-data.out")
# Rebuild the circular transcript layout implied by the transcript lengths
arc_transcript_list = create_arc(len_list)
print(arc_transcript_list)

[[0, 1, 2, 3], [1, 2], [2, 3, 4], [3, 4, 5, 6], [4, 5, 6, 7], [5, 6, 7], [6, 7], [7, 8], [8, 9, 0], [9, 0, 1]]
