In [1]:
import numpy as np
from scipy.special import logsumexp

def trans_to_nuc_abunds(trans_abunds_vec, trans_lengths_vec):
    """Convert transcript abundances into nucleotide abundances.

    Every transcript abundance is weighted by the length of its transcript, and the weighted
    values are then rescaled so that they sum to 1.

    Returns the nucleotide abundances as a list, or False when the two input vectors do not
    have the same length.
    """
    if len(trans_abunds_vec) != len(trans_lengths_vec):
        return False

    # Length-weighted (not yet normalized) abundance of each transcript
    weighted = [abund*length for abund, length in zip(trans_abunds_vec, trans_lengths_vec)]
    total = sum(weighted)

    return [value/total for value in weighted]

def nuc_to_trans_abunds(nuc_abunds_vec, trans_lengths_vec):
    """Convert nucleotide abundances into transcript abundances.

    Every nucleotide abundance is divided by the length of its transcript, and the resulting
    values are then rescaled so that they sum to 1.

    Returns the transcript abundances as a list, or False when the two input vectors do not
    have the same length.
    """
    if len(nuc_abunds_vec) != len(trans_lengths_vec):
        return False

    # Per-length (not yet normalized) abundance of each transcript
    per_length = [abund/length for abund, length in zip(nuc_abunds_vec, trans_lengths_vec)]
    total = sum(per_length)

    return [value/total for value in per_length]

def create_arc(trans_lens_vec):
    """Build the transcripts of a circular locus as lists of segment indices.

    The locus has one segment per entry of trans_lens_vec, numbered from 0. Transcript i starts
    at segment i and covers trans_lens_vec[i] consecutive segments, continuing from segment 0
    after the last segment has been reached.

    Parameters:
        trans_lens_vec: sequence of integer transcript lengths, measured in segments.

    Returns:
        A list with one entry per transcript; each entry is the list of segment indices that
        the transcript covers, in order. A length of zero (or less) gives an empty transcript.
    """
    num_segs = len(trans_lens_vec)

    # Modular indexing handles any amount of wrap-around, so every transcript contains exactly
    # trans_len segments even when it goes around the circle more than once.
    return [[(start_seg + offset) % num_segs for offset in range(trans_len)]
            for start_seg, trans_len in enumerate(trans_lens_vec)]

def create_reads(trans_vec, trans_abunds_vec, trans_lens_vec, N):
    """Simulate N reads from a set of transcripts.

    For each read a transcript is drawn with probability equal to its nucleotide abundance
    (computed from trans_abunds_vec and trans_lens_vec), and then one segment is drawn
    uniformly from that transcript.

    Parameters:
        trans_vec: list of transcripts, each a list of segment indices.
        trans_abunds_vec: transcript abundances, one per transcript.
        trans_lens_vec: transcript lengths, one per transcript.
        N: number of reads to generate.

    Returns:
        A list of N segments, one per read.

    Raises:
        ValueError: if trans_abunds_vec and trans_lens_vec differ in length.
    """
    nuc_abunds_list = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)
    if nuc_abunds_list is False:
        raise ValueError("trans_abunds_vec and trans_lens_vec must have the same length")

    num_transcripts = len(trans_vec)
    reads_list = []
    for _ in range(N):
        # Draw the transcript by index: np.random.choice needs a 1-D array-like, and a list of
        # transcripts is either 2-D (equal lengths) or ragged, neither of which it can sample.
        trans_idx = np.random.choice(num_transcripts, p=nuc_abunds_list)
        segment = np.random.choice(trans_vec[trans_idx])
        reads_list.append(segment)

    return reads_list
        
        
        

In [2]:
# Transcript lengths (in segments) and transcript abundances used for the simulation
trans_lens_list = [1, 1, 3]
trans_abunds_list = [0.2, 0.6, 0.2]

# Build the transcripts of the circular locus, then simulate 10 reads from them
transcript_list = create_arc(trans_lens_list)
reads_list = create_reads(transcript_list, trans_abunds_list, trans_lens_list, 10)

print(transcript_list, reads_list, sep="\n")

[[0], [1], [2, 0, 1]]
[1, 0, 1, 1, 1, 2, 0, 0, 2, 1]


In [3]:
def calc_nll(reads_vec, trans_vec, trans_abunds_vec, trans_lens_vec):
    """Return the total negative log likelihood of the reads for the given transcript abundances.

    The likelihood of a single read is the sum, over every transcript that contains the read,
    of that transcript's nucleotide abundance divided by its length. The sum is evaluated in
    log space with logsumexp, and the result is minus the sum of the per-read log likelihoods.
    """
    nuc_abunds = trans_to_nuc_abunds(trans_abunds_vec, trans_lens_vec)

    def read_log_prob(read):
        # One log-probability term per transcript that could have produced this read
        terms = [np.log(nuc_abunds[idx]/trans_lens_vec[idx])
                 for idx, transcript in enumerate(trans_vec)
                 if read in transcript]
        return logsumexp(terms)

    return -1*np.sum([read_log_prob(read) for read in reads_vec])

def get_lestrade_ests(reads_vec, trans_vec, trans_lens_vec):
    """Split every read evenly among the transcripts that contain it and total the shares.

    A read found in k transcripts adds 1/k to the count of each of those transcripts.

    Parameters:
        reads_vec: iterable of reads (segments).
        trans_vec: list of transcripts, each a container of segments.
        trans_lens_vec: transcript lengths; only its length is used here, to size the result.

    Returns:
        A list with the accumulated fractional read count of each transcript. The counts are
        not normalized and not corrected for transcript length.
        NOTE(review): the original function stopped after accumulating these counts and
        returned nothing; confirm whether callers expect them to be converted into abundance
        estimates.

    Raises:
        ValueError: if a read is not contained in any transcript.
    """
    trans_num = len(trans_lens_vec)
    trans_counts_list = [0]*trans_num

    # Iterate over each read
    for read in reads_vec:
        temp_counts = [0]*trans_num

        # Add a count for each transcript that includes the segment that generated the read
        for idx, transcript in enumerate(trans_vec):
            if read in transcript:
                temp_counts[idx] += 1

        count_tot = sum(temp_counts)
        if count_tot == 0:
            raise ValueError("read {!r} is not contained in any transcript".format(read))

        # Add this read's normalized shares (which sum to 1) to the running totals
        trans_counts_list = [trans_counts_list[i]+temp_counts[i]/count_tot
                             for i in range(trans_num)]

    return trans_counts_list
        

In [7]:
# Negative log likelihood of the simulated reads under the abundances used to generate them
# (test_1) and under a different set of abundances (test_2)
wrong_trans_abunds = [0.9, 0.05, 0.05]
test_1, test_2 = (
    calc_nll(reads_list, transcript_list, abunds, trans_lens_list)
    for abunds in (trans_abunds_list, wrong_trans_abunds)
)

print(test_1, test_2, sep="\n")

10.448188143273844
18.611371693284106


In [5]:
def create_arc_transcriptome(arc_locus, use_default_lengths, file_name):
    """Creates a transcriptome from a circular Arc locus and writes it to a FASTA file.

    Transcript number i (counting from 0) starts at position i*S_len of arc_locus, moved back by
    one position for every transcript except the first, and runs for its length, continuing
    from the start of the locus when it passes the end.

    Parameters:
        arc_locus: the locus sequence; must support len(), slicing and concatenation
            (for example a string or a list of characters).
        use_default_lengths: selects the transcript lengths.
            True  - use the default lengths L.
            False - draw a random length of 2, 3 or 4 times S_len for each of S transcripts.
            list  - use the given list of lengths.
        file_name: name of the FASTA file to write; an existing file is overwritten.

    Returns:
        The list of transcript sequences, in transcript order. In the file the records are
        named Arc1, Arc2, ... and each sequence is written in lines of at most 80 characters.

    Raises:
        TypeError: if use_default_lengths is neither True, False nor a list.

    NOTE(review): S, S_len, L and rand are not defined in this function; they are presumably
    set in another cell (rand presumably being a random-number module) - confirm.
    """
    # Check if the user wants the specified, default, or random lengths. The == comparisons
    # are deliberate so that values equal to True/False (such as 1 and 0) are still accepted.
    if isinstance(use_default_lengths, list):
        lengths = use_default_lengths
    elif use_default_lengths == True:
        lengths = L
    elif use_default_lengths == False:
        lengths = [rand.choice([2*S_len, 3*S_len, 4*S_len]) for _ in range(S)]
    else:
        raise TypeError("use_default_lengths must be True, False or a list of lengths")

    # Build one transcript per length actually in use, so a user-supplied list does not have
    # to contain exactly S entries.
    arc_transcripts = []
    for transcript_num, transcript_len in enumerate(lengths):
        seg_start = transcript_num*S_len

        # NOTE(review): this one-position shift for non-zero starts is kept from the original
        # implementation; confirm it matches the intended transcript start coordinates.
        if seg_start > 0:
            seg_start -= 1

        # If the transcript runs past the end of the locus, take the tail of the locus and
        # complete the transcript from the start of the locus
        if (seg_start+1) + transcript_len > len(arc_locus):
            first_seg = arc_locus[seg_start:]
            arc_transcripts.append(first_seg + arc_locus[:(transcript_len-len(first_seg))])
        else:
            arc_transcripts.append(arc_locus[seg_start:seg_start+transcript_len])

    # Write the transcripts to the FASTA file; the with-block closes the file even if a
    # write fails part-way through
    with open(file_name, "w") as fasta_file:
        for transcript_num, transcript_seq in enumerate(arc_transcripts):
            fasta_file.write(">"+"Arc"+str(transcript_num+1)+"\n")

            # Sequence lines of at most 80 characters each
            for line_start in range(0, len(transcript_seq), 80):
                fasta_file.write("".join(transcript_seq[line_start:line_start+80])+"\n")

    return arc_transcripts

In [8]:
# Scratch check: a list of five zeros
test = [0 for _ in range(5)]
test

[0, 0, 0, 0, 0]