In [1009]:
import os 
import numpy as np
import pandas as pd
import json as js
import textwrap
import random
import collections
import re
import matplotlib.pyplot as plt

%matplotlib inline

In [1216]:
alt_map = {'-':'0'}
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} 

def reverse_complement(seq):    
    for k,v in alt_map.items():
        seq = seq.replace(k,v)
    bases = list(seq) 
    bases = reversed([complement.get(base,base) for base in bases])
    bases = ''.join(bases)
    for k,v in alt_map.items():
        bases = bases.replace(v,k)
    return bases

def divide_chunks(l, n): 
#l can be list or string, n is fraglen
# looping till length l 
    for i in range(0, len(l), n):  
        yield l[i:i + n] 


# 1. Run kallisto and 2. Reproduce Moriarty's Results

See installation notes in Homework 2 folder. 

First instinct is that Moriarty failed to appreciate that directionality matters in sequencing these transcripts because each read is approximately 0.1x the length of each transcript fragment (1000 nt). 

I used the following arguments for each kallisto command for obtaining the data in the cells below:

1. Getting the index: kallisto index split_genome -i "Moriarty_index" -k 31 arc.fasta
2. Getting the abundance results: kallisto quant -i "Moriarty_index" -o "Moriarty_quant" --single -l 150 -s 20 arc.fastq.gz

Note that I used the masplit_genomeimum number of k-mers possible for generating the indesplit_genome. I am interested to see if using a smaller k-mer will result in some sort of difference in the quantification step. I need to read the rest of the Pachter paper (and how de Bruijn graphs are constructed) to understand how this impacts the indesplit_genome generation and subsequently quantification of the RNA-Seq data. 

In [1011]:
#load in results of kallisto run
Moriarty_info_dict = {}
with open("/Volumes/Macintosh HD/Users/Hailey/Dropbox/MCB112/Homework_2/Moriarty_quant/run_info.json") as info_file:
    info_list = info_file.readlines()
    info = ''.join(info_list)
    Moriarty_info_dict = js.loads(info)

In [1012]:
Moriarty_info_dict

{'call': 'kallisto quant -i Moriarty_index -o Moriarty_quant --single -l 150 -s 20 arc.fastq.gz',
 'index_version': 10,
 'kallisto_version': '0.44.0',
 'n_bootstraps': 0,
 'n_processed': 100000,
 'n_pseudoaligned': 99988,
 'n_targets': 10,
 'n_unique': 724,
 'p_pseudoaligned': 100.0,
 'p_unique': 0.7,
 'start_time': 'Fri Sep 21 08:57:37 2018'}

In [1013]:
#make pandas dataframe of abundance of Arc loci
abundance_Moriarty_df = pd.read_csv("/Volumes/Macintosh HD/Users/Hailey/Dropbox/MCB112/Homework_2/Moriarty_quant/abundance.tsv", sep="\t")

In [1014]:
#reproduced Moriarty's results:
print(abundance_Moriarty_df.to_string(index=False))

target_id  length  eff_length  est_counts       tpm
    Arc1    4000        3851     2382.22   17374.2
    Arc2    2000        1851     3772.62   57244.5
    Arc3    3000        2851    28342.40  279214.0
    Arc4    4000        3851    10476.80   76410.3
    Arc5    4000        3851    12679.70   92476.6
    Arc6    3000        2851     1770.75   17444.5
    Arc7    2000        1851     5444.64   82615.2
    Arc8    2000        1851     5871.52   89092.5
    Arc9    3000        2851     2649.55   26101.9
   Arc10    3000        2851    26597.80  262027.0


On kallisto results - tpms look very similar to Moriarty's results. It appears I have replicated his results here.

# 3. Simulate an Arc transcriptome and RNA-seq reads

In [1098]:
# Set up the Arc locus 
#
S         = 10           # Number of segments in the Arc locus (A..J)
T         = S            # Number of different transcripts (the same, one starting on each segment, 1..10)
N         = 100000       # total number of observed reads we generate
alpha     = 0.999        # base calling accuracy (Q30 bases, typical of current Illumina)
len_S     = 1000         # length of each segment (nucleotides)
len_Arc   = len_S * S    # total length of the Arc locus (nucleotides)
len_R     = 75           # read length
mean_frag = 150          # fragment size: mean (of a truncated Gaussian)
sd_frag   = 20           # fragment size: stdev

In [1016]:
#assuming here that each Arc locus size is 2000-4000 nt long as stated in the homework esplit_genomeplanation
Arc_size = [2000, 3000, 4000]

#nucleotide array for generating sequences
nts = ['A', 'T', 'C', 'G']

#generate random segment lengths
L_rand = pd.Series(np.random.choice(Arc_size, S))

#generate 10,000 bp long random sequence of nucleotides (nts) with each having a 0.25 probability of occuring
Arc_genome = np.random.choice(nts, 10000, p=[0.25, 0.25, 0.25, 0.25])
#generate random, but positive vis
v_rand = pd.Series(np.absolute(np.random.rand(10)))

In [1170]:
#set lists for transcripts and their abundances
Arc_transcripts = ['Arc1', 'Arc2', 'Arc3', 'Arc4', 'Arc5', 'Arc6', 'Arc7', 'Arc8', 'Arc9', 'Arc10']

#create list of already used segments for simulation and concatenate to new random sequences
Seg_cov = ['ABCD', 'BC', 'CDE', 'DEFG', 'EFGH', 'FGH', 'GH', 'HI', 'IJA', 'JAB']
transcript_seg = {}
transcript_seg = dict(zip(Arc_transcripts, Seg_cov))

#simulations using original data

v_i = [0.008, 0.039, 0.291, 0.112, 0.127, 0.008, 0.059, 0.06, 0.022, 0.273]

#normalize abundances to have a total probability of 1
c = 1/sum(v_i)
v_i_n = [i*c for i in v_i]

#sample transcript i according to its abundance
Arc_transcript_samples_10 = np.random.choice(Arc_transcripts, size=10, p=v_i_n)
Arc_transcript_samples_20 = np.random.choice(Arc_transcripts, size=20, p=v_i_n)

In [1171]:
#make dict of transcripts and store in FASTA file (same length as indicated in the original set)
genome_sequence = list(Arc_genome)

#prepare each of the transcripts with linear coordinates from original data table
#extraction from random sequence with same coordinates as OG genome
Arc1 = ''.join(genome_sequence[0:4000])

Arc2 = ''.join(genome_sequence[1000:3000])

Arc3 = ''.join(genome_sequence[2000:5000])

Arc4 = ''.join(genome_sequence[3000:7000])

Arc5 = ''.join(genome_sequence[4000:8000])

Arc6 = ''.join(genome_sequence[5000:8000])

Arc7 = ''.join(genome_sequence[6000:8000])

Arc8 = ''.join(genome_sequence[7000:9000])

Arc9 = ''.join(genome_sequence[8000:10000] + genome_sequence[0:1000])

Arc10 = ''.join(genome_sequence[9000:10000] + genome_sequence[0:2000])

#initialize ordered dict for the transcript dictionary so that Arc10 is actually last in file when written
transcript_dict = collections.OrderedDict()
transcript_dict['Arc1'] = Arc1
transcript_dict['Arc2'] = Arc2
transcript_dict['Arc3'] = Arc3
transcript_dict['Arc4'] = Arc4
transcript_dict['Arc5'] = Arc5
transcript_dict['Arc6'] = Arc6
transcript_dict['Arc7'] = Arc7
transcript_dict['Arc8'] = Arc8
transcript_dict['Arc9'] = Arc9
transcript_dict['Arc10'] = Arc10

#write out fasta file    
fasta_rand = open('Arc_og_length_rand_seq.fasta', 'w')

for i in range(len(transcript_dict)):
    
    #make sure to get proper formatting for FASTA
    fasta_rand.write('>' + list(transcript_dict.keys())[i] + '\n' + '\n'.join(textwrap.wrap(list(transcript_dict.values())[i], 60)) + '\n')

#do not forget to close it
fasta_rand.close()

## Simulating reads

In [1257]:
#write out fastq file    
fastq = open('Arc_og_len_2.fastq', 'w')
for x in range(10000):
    #choose randomly sized fragment from sample transcript
    Arc_transcript_samples_10 = np.random.choice(Arc_transcripts, size=10, p=v_i_n)

    rand_transcripts = []
    for transcript in Arc_transcript_samples_10:
        for Arc in list(transcript_dict.keys()):
            if str(transcript) == Arc:
                #print(transcript)
                while True:
                    fraglen = int(np.random.normal(mean_frag, sd_frag))
                    if fraglen >= len_R: break
                if fraglen > len(transcript_dict['{}'.format(transcript)]): fraglen = len(transcript_dict['{}'.format(transcript)])
                if fraglen < 75 : fraglen = 75
                x = list(divide_chunks(str(list(transcript_dict.values())), fraglen))
                y = len(x)
                z = np.random.choice(y-1, 1)
                rand_transcripts.append(x[int(z)])

    fragments = rand_transcripts
    rand_reads = []
    for frag in fragments:
        #print(len(frag))
        j = np.random.randint(len(frag))
        read_f = frag[j: j+75]
        reads_f.append(read_f)
        read_r = frag[len(frag)-75: len(frag)]
        reads_r.append(reverse_complement(read_r))
        read_r = reverse_complement(read_r)
        x = np.random.randint(2)
        if x == 0:
            read = read_r
        if x == 1:
            read = read_f
        rand_reads.append(read)

    #introduce errors into reads
    nts = ['A', 'T', 'C', 'G']
    p_A = np.array([alpha, 1-alpha, 1-alpha, 1-alpha])
    p_A /= p_A.sum()

    p_T = np.array([1-alpha, alpha, 1-alpha, 1-alpha])
    p_T /= p_T.sum()

    p_C = np.array([1-alpha, 1-alpha, alpha, 1-alpha])
    p_C /= p_C.sum()

    p_G = np.array([1-alpha, 1-alpha, 1-alpha, alpha])
    p_G /= p_G.sum()

    rand_read_err = []
    for read in rand_reads:
        for i in range(len(read)):
            if read[i] == nts[0]:
                read[i].replace(read[i], np.random.choice(nts, p=p_A))
            elif read[i] == nts[1]:
                read[i].replace(read[i], np.random.choice(nts, p=p_T))
            elif read[i] == nts[2]:
                read[i].replace(read[i], np.random.choice(nts, p=p_C))
            elif read[i] == nts[3]:
                read[i].replace(read[i], np.random.choice(nts, p=p_G))
        rand_read_err.append(read)
    #write sequences with errors to FASTQ format file (in same format as one provided)

    for i in range(len(rand_read_err)):
        #make sure to get proper formatting for FASTA
        fastq.write('@read{}'.format(i) + '\n' + rand_read_err[i] + '\n' + '+' + '\n' + '?'*int(75) + '\n')
    #do not forget to close it
fastq.close()

## Debugging

In [None]:
#make list of random (plausible) loci of the Arc genome

#make string long enough so that I don't end up getting empty strings when choosing randomly (but in order of locus)
arc_str = 'ABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJ'
Arc_rand = []
transcript = ''

#loop through string in order to get a list of random locuses
j=0
for arc_seg in range(10):
    j = j + np.random.randint(2,5)
    transcript = ''.join(np.random.choice([arc_str[j :j + np.random.randint(2,5)]]))
    Arc_rand.append(transcript)
        
Arc_rand

The circular architecture of the genome and how the reads are sampled from the circular architecture may be causing an issue with how kallisto records reads. 

## Extra Work

In [1251]:
#choose 75 nt sample read from fragment in random orientation
j = 0        
for i in range(0, len(fragments), 75):
    for frag in fragments:
        j = np.random.randint(len(frag) - 75)
        read_f = frag[j: j+75]
        reads_f.append(read_f)
        read_r = frag[len(frag)-75: len(frag)]
        reads_r.append(reverse_complement(read_r))
        read_r = reverse_complement(read_r)
        x = np.random.randint(2)
        if x == 0:
            read = read_r
        if x == 1:
            read = read_f

#print(reads)
#make reads for FASTQ file
#be sure to add errors
#rand_reads = []
#for i in range(100000):
    #random_read = np.random.choice(reads)
    #rand_reads.append(random_read)

75
75
75
75
75
75


In [1160]:
#introduce errors into reads
nts = ['A', 'T', 'C', 'G']
p_A = np.array([alpha, 1-alpha, 1-alpha, 1-alpha])
p_A /= p_A.sum()

p_T = np.array([1-alpha, alpha, 1-alpha, 1-alpha])
p_T /= p_T.sum()

p_C = np.array([1-alpha, 1-alpha, alpha, 1-alpha])
p_C /= p_C.sum()

p_G = np.array([1-alpha, 1-alpha, 1-alpha, alpha])
p_G /= p_G.sum()

rand_read_err = []
for read in rand_reads:
    for i in range(len(read)):
        if read[i] == nts[0]:
            read[i].replace(read[i], np.random.choice(nts, p=p_A))
        elif read[i] == nts[1]:
            read[i].replace(read[i], np.random.choice(nts, p=p_T))
        elif read[i] == nts[2]:
            read[i].replace(read[i], np.random.choice(nts, p=p_C))
        elif read[i] == nts[3]:
            read[i].replace(read[i], np.random.choice(nts, p=p_G))
    rand_read_err.append(read)

In [1161]:
#write sequences with errors to FASTQ format file (in same format as one provided)

#write out fastq file    
fastq = open('Arc_og_length_100.fastq', 'w')
for i in range(len(rand_read_err)):
    #make sure to get proper formatting for FASTA
    fastq.write('@read{}'.format(i) + '\n' + rand_read_err[i] + '\n' + '+' + '\n' + '?'*int(75) + '\n')
#do not forget to close it
fastq.close()

In [878]:
#create dictionary of the genome with segment names as keys, splitting the genome array by the number of segments
split_genome = np.array_split(Arc_genome, S)

Arc_dict = {}
Arc_dict['A'] = split_genome[0]
Arc_dict['B'] = split_genome[1]
Arc_dict['C'] = split_genome[2]
Arc_dict['D'] = split_genome[3]
Arc_dict['E'] = split_genome[4]
Arc_dict['F'] = split_genome[5]
Arc_dict['G'] = split_genome[6]
Arc_dict['H'] = split_genome[7]
Arc_dict['I'] = split_genome[8]
Arc_dict['J'] = split_genome[9]

In [879]:
dict_of_dicts = {}

transcript_sequences = []

#use the keys in Arc_dict to assign sequences to each of the transcripts with their respective segments to a new 
#list of sequences representing each transcript
print(Arc_dict.keys(), transcript_seg.values())

dict_keys(['G', 'F', 'B', 'E', 'H', 'C', 'I', 'J', 'A', 'D']) dict_values(['IJA', 'EFGH', 'BC', 'DEFG', 'HI', 'CDE', 'ABCD', 'GH', 'FGH', 'JAB'])


In [880]:
#create random list of Arc segment names with requirements
Arc_seg = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
Arc_seg_len = 1000

#make list of possible Arc locuses based on genome structure given
Arc_rand = []
transcript = ''
for seg in Arc_seg:
    #get random combinations of letters from the list above and concatenate randomly in groups of 2 - 4 locuses
    transcript = ''.join(np.random.choice(Arc_seg, replace=False) for _ in range(np.random.randint(2, 5)))
    #esplit_genomeclude empty transcripts:
    if len(transcript) >= 2 and len(transcript) <= 4:
             Arc_rand.append(transcript)
Arc_rand

['JJJ', 'IJ', 'ADCC', 'FIG', 'ACBH', 'AGJC', 'FGCF', 'CG', 'FCC', 'CBBE']

In [None]:
#step 1 - make dictionary of segment:bp pairs that are random
#step 2 - make sequences of the random locuses generated from the list above, with a new dataframe

Arc_seg = ['A','B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

In [None]:
Arc_transcript_sample