In [55]:
import numpy as np
from scipy.special import logsumexp

def trans_to_nuc_abunds(trans_abunds_vec, trans_lengths_vec):
    """Given a vector of transcription abundances and a vector of transcript lengths, returns the nucelotide 
    abundances as a list"""
    
    if len(trans_abunds_vec) != len(trans_lengths_vec): return False
    
    num_transcripts = len(trans_abunds_vec)
    nuc_abunds_list = np.multiply(trans_abunds_vec, trans_lengths_vec)
    nuc_abunds_tot = sum(nuc_abunds_list)
    nuc_abunds_list = np.true_divide(nuc_abunds_list, nuc_abunds_tot)
    
    return nuc_abunds_list

def nuc_to_trans_abunds(nuc_abunds_vec, trans_lengths_vec):
    """Given a vector of nucleotide abundances and a vector of transcript lengths, returns the transcript
    abundances as a list"""
    
    if len(nuc_abunds_vec) != len(trans_lengths_vec): return False
    
    num_transcripts = len(nuc_abunds_vec)
    trans_abunds_list = np.true_divide(nuc_abunds_vec, trans_lengths_vec)
    trans_abunds_tot = sum(trans_abunds_list)
    trans_abunds_list = np.true_divide(trans_abunds_list, trans_abunds_tot)
    
    return trans_abunds_list

def generate_rand_abunds_and_lens(trans_num, len_range):
    """"""
    # Generate an (almost) uniformly random vector of transcript abundances
    abunds_list = np.random.dirichlet(np.ones(trans_num),size=1)[0]
    lens_list = np.random.randint(len_range[0], len_range[1]+1, trans_num)
    
    return abunds_list, lens_list

def create_arc(trans_lens_vec):
    """"""
    trans_list = []
    
    num_segs = len(trans_lens_vec)
    seg_list = [i for i in range(num_segs)]
    
    for start_seg, trans_len in enumerate(trans_lens_vec):
        end_seg = start_seg+trans_len
        if end_seg > num_segs:
            transcript = seg_list[start_seg:]+seg_list[:end_seg-num_segs]
            trans_list.append(transcript)
            
        else: trans_list.append(seg_list[start_seg:end_seg])
    
    return trans_list

def create_reads(trans_vec, trans_abunds_vec, trans_lens_vec, N):
    """"""
    nuc_abunds_list = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)
    reads_list = []
    for i in range(N):
        transcript = np.random.choice(trans_vec,p=nuc_abunds_list)
        segment = np.random.choice(transcript)
        reads_list.append(segment)
        
    return reads_list

array([0.06774375, 0.93225625])

[[0], [1]]

In [56]:
reads_list = create_reads(transcript_list, trans_abunds_list, trans_lens_list, 5)
reads_list

[[0], [1]]


ValueError: 'a' must be 1-dimensional

In [37]:
#TODO either the math or the code for this is wrong. I think the problem might even be in the creation of the reads
# from the transcriptome (check this first)
def calc_nll(reads_vec, trans_vec, trans_abunds_vec, trans_lens_vec):
    """"""
    nuc_abunds_list = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)
    log_probs_list = []
    
    # Iterate over each read
    for read in reads_vec:
        read_prob_list = []
        
        # For each read, iterate over each transcript
        for idx, transcript in enumerate(trans_vec):
            
            # If the transcript contains the read sequence, calculate the probability that the read came from this
            # trasncript and add it to the probability list for this transcript
            if read in transcript:
                prob = nuc_abunds_list[idx]/trans_lens_vec[idx]
                print("prob: " + str(prob))
                read_prob_list.append(prob)
        
        # Add the total probability
        print("total: " + str(np.log(np.sum(read_prob_list))))
        log_probs_list.append(np.log(np.sum(read_prob_list)))
        print(log_probs_list)
    total_nll = -1*np.sum(log_probs_list)
    
    return total_nll

def get_lestrade_ests(reads_vec, trans_vec, trans_lens_vec):
    """"""
    trans_num = len(trans_lens_vec)
    trans_counts_list = [0]*trans_num
    
    # Iterate over each read
    for read in reads_vec:
        temp_counts = [0]*trans_num
        
        # For each read, add a count for each transcript that include the segment that generated the read
        for idx, transcript in enumerate(trans_vec):
            if read in transcript:
                temp_counts[idx] += 1
                
        # Normalize the counts to add to 1
        temp_tot = sum(temp_counts)
        temp_counts = np.true_divide(temp_counts, temp_tot)
        
        # Add the count proportions for each read to the total counts list
        trans_counts_list = np.add(trans_counts_list, temp_counts)
    
    counts_tot = sum(trans_counts_list)
    est_nuc_abunds_list = np.true_divide(trans_counts_list, counts_tot)
    est_trans_abunds_list = nuc_to_trans_abunds(est_nuc_abunds_list, trans_lens_vec)
    
    return est_trans_abunds_list
        

In [38]:
lestrade_trans_abunds_list = get_lestrade_ests(reads_list, transcript_list, trans_lens_list)
true_nll = calc_nll(reads_list, transcript_list, trans_abunds_list, trans_lens_list)
lestrade_nll = calc_nll(reads_list, transcript_list, lestrade_trans_abunds_list, trans_lens_list)

prob: 0.07233910583675617
prob: 0.11086181573198657
prob: 0.006308360926680572
total: -1.663317271514626
[-1.663317271514626]
prob: 0.07233910583675617
prob: 0.029863776824219028
total: -2.2807953955342386
[-1.663317271514626, -2.2807953955342386]
prob: 0.11086181573198657
prob: 0.006308360926680572
total: -2.1441278995907647
[-1.663317271514626, -2.2807953955342386, -2.1441278995907647]
prob: 0.060489635509298034
prob: 0.0025243081005250087
prob: 0.008405451272542566
total: -2.639185809515329
[-1.663317271514626, -2.2807953955342386, -2.1441278995907647, -2.639185809515329]
prob: 0.012866593716650375
prob: 0.027807774200653695
prob: 0.060489635509298034
total: -2.291012282769677
[-1.663317271514626, -2.2807953955342386, -2.1441278995907647, -2.639185809515329, -2.291012282769677]
prob: 0.05555555555555555
prob: 0.05555555555555555
prob: 0.08333333333333331
total: -1.637608789400797
[-1.637608789400797]
prob: 0.05555555555555555
prob: 0.049999999999999996
total: -2.24851787172377
[-1.6

In [39]:
print("True transcript abundances:")
print(trans_abunds_list)
print("\nLestrade's transcript abundances:")
print(lestrade_trans_abunds_list)
print("\nDifference:")
print(np.abs(np.subtract(trans_abunds_list, lestrade_trans_abunds_list)))

print("\n\nNegative log-likelyhood of true parameters:")
print(true_nll)
print("\nNegative log-likelyhood of Lestrade's parameters:")
print(lestrade_nll)

True transcript abundances:
[0.20604119 0.08506005 0.03664751 0.07920401 0.17229072 0.00718991
 0.02394098 0.05589351 0.31576421 0.01796791]

Lestrade's transcript abundances:
[0.14925373 0.13432836 0.05970149 0.05970149 0.11940299 0.04477612
 0.05970149 0.         0.14925373 0.2238806 ]

Difference:
[0.05678746 0.04926831 0.02305398 0.01950251 0.05288773 0.03758621
 0.03576051 0.05589351 0.16651048 0.20591269]


Negative log-likelyhood of true parameters:
11.018438658924635

Negative log-likelyhood of Lestrade's parameters:
10.765482465585006


In [None]:
def create_arc_transcriptome(arc_locus, use_default_lengths, file_name):
    """Creates a tanscriptome based off an arc locus given as an input and outputs it to a FASTA file with 
    name "output". use_defualt_lengths allows the user to use the hardcoded lengths v by setting the 
    parameter to True, generate random lengths between 2 and 4 segments long (inclusive) by setting it to 
    False, or use their own given transcripts by setting the parameter equal to a list of lengths they want
    to use. file_name gives the name of the file that should be written to as a FASTA file"""
    
    arc_transcripts = [1]*S
    
    # Check if the user wants the random, defualt, or specified lengths
    if use_default_lengths == False:
        L_use = [1]*S
    
        for idx, element in enumerate(L_use):
            L_use[idx] = rand.choice([2*S_len, 3*S_len, 4*S_len])    
            
    elif use_default_lengths == True:
        L_use = L
        
    elif isinstance(use_default_lengths, list):
        L_use = use_default_lengths
        
    
    # Iterate over each transcript in the Arc locus
    for transcript_num, transcript_len in enumerate(L_use):
        seg_start = transcript_num*S_len
        
        # Correct for off-by-one error for non-zero segment start indices
        if seg_start > 0:
            seg_start -= 1  
        
        # Test if this iteration's transcript wraps fully around the circle, and then add the arc_locus
        # splits accordingly
        if (seg_start+1) + transcript_len > len(arc_locus):
            first_seg = arc_locus[seg_start:]
            arc_transcripts[transcript_num] = first_seg + arc_locus[:(transcript_len-len(first_seg))]   
        else:
            arc_transcripts[transcript_num] = arc_locus[seg_start:seg_start+transcript_len]
      
    # Write the arc_transcripts to a FASTA file
    file = open(file_name, "w")
    
    for transcript_num, transcript_seq in enumerate(arc_transcripts):
        fasta_list = [transcript_seq[i * 80:(i + 1) * 80] for i in range((len(transcript_seq) + 80 - 1)
                                                                         // 80 )]  
        file.write(">"+"Arc"+str(transcript_num+1)+"\n")
        
        for line in fasta_list:
            new_line = "".join(line)
            file.write(new_line+"\n")

    file.close()
    
    return arc_transcripts

In [None]:
test = np.random.dirichlet(np.ones(10),size=1)[0]
print(test)
print(sum(test))
test = np.divide(test, sum(test))