# Progressive read size evaluation

### This section is using the big 15GB data file as a source. See below for new configuration.
We know we can handle 10k reads, so let's take progressively larger numbers of reads to show how processing time grows.

In [1]:
from pyspark import SparkContext

In [2]:
import subprocess
import os

In [3]:
import eulercuda as ec

In [4]:
import pycuda.driver
import pycuda.autoinit

In [5]:
from collections import OrderedDict

In [9]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()

In [7]:
# Path of the reads file consumed by make_dataset(), which reads it line by line.
datafile = '/home/ubuntu/genome/sra_clean'

In [10]:
def make_dataset(read_size, path=None):
    """Load reads from the first ``read_size`` lines of a text file.

    Args:
        read_size: Maximum number of lines to consume from the start of the
            file. This bounds the lines examined, not the reads returned.
        path: File to read, one read per line. Defaults to the notebook-level
            ``datafile`` when omitted.

    Returns:
        list[str]: Right-stripped lines in file order, excluding empty lines
        and any line containing an ``'N'`` character. The list may therefore
        hold fewer than ``read_size`` entries.
    """
    if path is None:
        path = datafile
    dataset = []
    with open(path, 'r') as infile:
        # zip() stops at whichever iterator runs out first, so a file shorter
        # than read_size ends the loop instead of yielding '' from readline()
        # at EOF and padding the dataset with empty reads.
        for _, raw in zip(range(read_size), infile):
            line = raw.rstrip()
            if line and 'N' not in line:
                dataset.append(line)
    return dataset

# New method using the "cleaned" E. coli data file

In [11]:
def hail_mary(path, data):
    """Run the GPU assembler over each partition of ``data`` and save to HDFS.

    Any existing output under ``/genome/<path>`` is removed first, then every
    partition is passed to ``ec.assemble2`` and the result is written with
    ``saveAsTextFile``.

    Reads the notebook-level globals ``k``, ``dataLength`` and ``dataCount``;
    they must be assigned before this function is called.

    Args:
        path: Output directory name, appended to ``/genome/`` on HDFS.
        data: Object exposing ``mapPartitions`` (the timing loop passes the
            result of ``sc.parallelize``); each partition iterator is handed
            to ``ec.assemble2`` as ``buffer``.

    Returns:
        None.
    """
    # Imported locally under the alias that the lambda below closes over.
    import eulercuda.eulercuda as ec
    # Clear output from a previous run (presumably because saveAsTextFile
    # will not overwrite an existing directory -- TODO confirm).
    subprocess.call(["hdfs", "dfs", "-rm", "-r", "-f", '/genome/' + path])
    assembled = data.mapPartitions(
        lambda x: ec.assemble2(k, buffer=x, readLength=dataLength, readCount=dataCount))
    assembled.saveAsTextFile('hdfs://172.31.26.32/genome/' + path)

In [12]:
# Benchmark state for the GPU-assisted runs.
increment = 5000             # growth in read count between successive runs
read_size = 5000             # read count used for the first run
timing_data = OrderedDict()  # read count -> %timeit result, in run order
dataset = []                 # placeholder; rebuilt by make_dataset() on every run

In [13]:
k = 17
lmerLength = 18
run_ok = True
while run_ok:
    dataset = make_dataset(read_size)
    dataLength = len(dataset[0])
    rdd_data = sc.parallelize(dataset, (read_size // 2000))
    dataCount = rdd_data.count() // rdd_data.getNumPartitions()
    try:
        time = %timeit -o -r 5 hail_mary('ec'+str(read_size)[:2]+'k_output',rdd_data)
    except:
        print('Failure at '+ str(read_size), flush=True)
        run_ok = False
    if run_ok:
        timing_data[read_size] = time
        print(read_size,time.best, flush=True)
        print()
        read_size += increment

1 loop, best of 5: 10.9 s per loop
5000 10.854565590001584

1 loop, best of 5: 9.12 s per loop
10000 9.11539518599966

1 loop, best of 5: 10.3 s per loop
15000 10.260186309998971

1 loop, best of 5: 15.8 s per loop
20000 15.803715278998425

1 loop, best of 5: 16.8 s per loop
25000 16.77285344900156

1 loop, best of 5: 20.2 s per loop
30000 20.23722870200072

1 loop, best of 5: 23 s per loop
35000 23.008605576000264

Failure at 40000


In [None]:
# Read sizes whose timing run completed (only successful runs are stored).
timing_data.keys()

In [None]:
# Last key inserted into timing_data, i.e. the most recent read size that
# was timed successfully.
upper_limit = next(reversed(timing_data))

In [None]:
# Plot series: the .best value of each stored %timeit result, and the
# matching read counts expressed in thousands.
run_times = [result.best for result in timing_data.values()]
read_sizes = [size // 1000 for size in timing_data]

In [None]:
# Display the x-axis values (read counts in thousands).
read_sizes

In [None]:
# Plot GPU-assisted assembly time against the number of reads.
p = figure(plot_width=800, plot_height=400, title="GPU Assisted Assembly",
           x_axis_label="Reads (thousands)", y_axis_label="Best time (s)")
p.line(read_sizes, run_times, line_width=1)
show(p)

## Now let's do the same thing with the pure-Python assembler

In [None]:
# Input reads for the baseline run (same path as the GPU section).
# Alternative input: '/home/ubuntu/genome/ba100k_clean.txt'
datafile = '/home/ubuntu/genome/sra_clean'

In [None]:
# Base -> complementary base lookup, used by do_baseline() to build the
# reverse complement of each read.
# NOTE(review): the name is a misspelling of "complement"; renaming it would
# also require updating do_baseline().
compliment = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

In [None]:
def twin(km):
    """Return the reverse complement of ``km``.

    The sequence is reversed and A<->T, C<->G are swapped; any other
    character is carried over unchanged.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    flipped = []
    for base in km[::-1]:
        flipped.append(pairing.get(base, base))
    return "".join(flipped)

def fw(km):
    """Yield the four possible forward neighbours of ``km``.

    Each neighbour drops the first character of ``km`` and appends one of
    A, C, G, T (in that order).
    """
    suffix = km[1:]
    yield from (suffix + base for base in 'ACGT')

def bw(km):
    """Yield the four possible backward neighbours of ``km``.

    Each neighbour drops the last character of ``km`` and prepends one of
    A, C, G, T (in that order).
    """
    prefix = km[:-1]
    yield from (base + prefix for base in 'ACGT')

In [None]:
def contig_to_string(c):
    """Collapse a list of k-mers into a single string.

    The result is the first k-mer followed by the last character of every
    subsequent k-mer.
    """
    trailing = [kmer[-1] for kmer in c[1:]]
    return c[0] + ''.join(trailing)

def get_contig(d,km):
    '''
    Find the contig that contains k-mer ``km``.

    The path is extended forward from ``km`` and forward from ``twin(km)``;
    the second path is reversed and twinned to become the part that
    precedes ``km``.

    Return: (contig string, list of k-mers in the contig)
    '''
    forward_path = get_contig_forward(d, km)
    backward_path = get_contig_forward(d, twin(km))

    if km in fw(forward_path[-1]):
        # The last forward k-mer leads straight back to km, so the forward
        # path alone is used.
        kmers = forward_path
    else:
        # Skip backward_path[0] (it is twin(km)), then flip the remainder so
        # it reads in the same direction as the forward path.
        leading = [twin(x) for x in reversed(backward_path[1:])]
        kmers = leading + forward_path
    return contig_to_string(kmers), kmers


def get_contig_forward(d,km):
    """Extend a path of k-mers forward from ``km`` while it is unambiguous.

    Extension stops when the current k-mer does not have exactly one forward
    neighbour in ``d``, when the next k-mer would close a cycle or be a
    hairpin, or when that next k-mer does not have exactly one backward
    neighbour in ``d``.

    Returns the list of k-mers on the path, starting with ``km``.
    """
    path = [km]

    while True:
        tail = path[-1]

        # Exactly one successor must be present in d.
        successors = [nxt for nxt in fw(tail) if nxt in d]
        if len(successors) != 1:
            break
        cand = successors[0]

        if cand in (km, twin(km)):
            break # break out of cycles or mobius contigs
        if cand == twin(tail):
            break # break out of hairpins

        # The candidate must itself have exactly one predecessor in d.
        predecessors = [prev for prev in bw(cand) if prev in d]
        if len(predecessors) != 1:
            break

        path.append(cand)

    return path

def all_contigs(k_tuples):
    """Build the contig strings for a collection of k-mers.

    ``k_tuples`` is turned into a dict with ``dict(k_tuples)``; its keys are
    the k-mers. Each k-mer not yet covered starts a new contig, and every
    k-mer on that contig (plus its twin) is marked as covered.

    Returns the list of contig strings.
    """
    kmer_table = dict(k_tuples)
    visited = set()
    contigs = []
    for kmer in kmer_table:
        if kmer in visited:
            continue
        sequence, members = get_contig(kmer_table, kmer)
        for member in members:
            visited.update((member, twin(member)))
        contigs.append(sequence)
    return contigs

In [None]:
def do_baseline(run_size, path, k=17):
    """Assemble ``run_size`` reads with the pure-Python assembler, save to HDFS.

    Any existing output under ``/genome/<path>`` is removed first. K-mers
    are extracted from every read and from its reverse complement, counted
    with ``reduceByKey``, and each partition of the counts is passed to
    ``all_contigs``.

    Args:
        run_size: Number of lines requested from ``make_dataset``; also sets
            the partition count (``run_size // 2000``).
        path: Output directory name, appended to ``/genome/`` on HDFS.
        k: K-mer length. Defaults to 17, the value previously hard-coded.

    Returns:
        None.
    """
    subprocess.call(["hdfs", "dfs", "-rm", "-r", "-f", '/genome/' + path])
    dataset = make_dataset(run_size)
    data = sc.parallelize(dataset, (run_size // 2000))

    def kmers_of(read):
        # Every overlapping length-k substring of one read.
        return [read[i:i + k] for i in range(len(read.rstrip()) - k + 1)]

    fwd_list = data.flatMap(kmers_of)
    # twin() gives the reverse complement, replacing the inline lambda that
    # duplicated it.
    rev_list = data.map(twin).flatMap(kmers_of)
    kmer_list = fwd_list + rev_list
    kmer_counts = kmer_list.map(lambda kmer: (kmer, 1)).reduceByKey(lambda x, y: x + y)
    kmer_counts.mapPartitions(all_contigs).saveAsTextFile('hdfs://172.31.26.32/genome/' + path)
    

In [None]:
# Exclusive upper bound for the baseline range(): one 5000-read step past the
# last read size timed successfully, so that size is still included.
# Derived from timing_data rather than incremented in place, so re-running
# this cell does not keep growing the limit.
#upper_limit = 60000
upper_limit = next(reversed(timing_data)) + 5000

In [None]:
# Baseline results: read count -> %timeit result, in run order.
base_timing = OrderedDict()

In [None]:
#do_baseline(10000,'base_output')

In [None]:
for i in range(5000, upper_limit, 5000):
#     print(str(i), ' reads: ', flush=True)
    timing = %timeit -o -r 5 do_baseline(i, "base_output")
    print(i, timing.best, flush=True)
    print(flush=True)
    base_timing[i] = timing

In [None]:
# Baseline plot series: the .best value of each stored %timeit result, and
# the matching read counts expressed in thousands.
base_run_times = [result.best for result in base_timing.values()]
base_read_sizes = [size // 1000 for size in base_timing]

In [None]:
# Compare both assemblers on one figure.
p = figure(plot_width=800, plot_height=400, title="GPU Assisted Assembly",
           x_axis_label="Reads (thousands)", y_axis_label="Best time (s)")
# multi_line takes the x series and the y series as two separate lists of
# lists; the original passed one list mixing both and no ys at all.
# firebrick = GPU-assisted runs, navy = pure-Python baseline.
p.multi_line([read_sizes, base_read_sizes], [run_times, base_run_times],
             color=["firebrick", "navy"], line_width=1)
show(p)