# Progressive read size evaluation
We know we can process 10k reads, so let's take progressively larger numbers of reads to show how processing time grows with input size.

In [1]:
from pyspark import SparkContext

In [2]:
import subprocess
import os

In [3]:
import eulercuda as ec

In [4]:
import pycuda.driver
import pycuda.autoinit

In [5]:
from tqdm import *

In [6]:
from ipywidgets import FloatProgress
from IPython.display import display

In [7]:
def log_progress(sequence, every=None, size=None):
    """
    Yield every item of <sequence> while showing a notebook progress widget.

    sequence -- any iterable; if it has no len() and <size> is not given,
                it is treated as an iterator of unknown length.
    every    -- refresh the widget every <every> items. When the size is
                known and <every> is None, it defaults to 1 for up to 200
                items and to int(size / 200) otherwise.
    size     -- total number of items, used as the bar maximum.

    Raises AssertionError when the length is unknown and <every> is None.
    Any exception raised during iteration turns the bar red ('danger')
    and is re-raised; normal completion turns it green ('success').
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Discover the length when the caller did not supply one.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No meaningful maximum: show a full bar in the 'info' colour.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            # Always refresh on the first item, then once per <every> items.
            refresh = count == 1 or count % every == 0
            if refresh and unknown_length:
                caption.value = '{index} / ?'.format(index=count)
            elif refresh:
                bar.value = count
                caption.value = u'{index} / {size}'.format(index=count, size=size)
            yield item
    except:
        # Flag the failure visually, then let the error propagate unchanged.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = str(count or '?')

## Reading Data
In this case I'm going to read a local data file into a list, then randomize the list and take slices. Spark is then used to parallelize each slice, turning it into an RDD.

In [8]:
# Absolute path of the FASTQ input file on the machine running this notebook.
datafile = '/home/ubuntu/genome/Ecoli-RR359304-2.fastq'

In [9]:
def read_fastq(filename, total=None):
    """
    Read fastq formatted <filename> and return a list of reads.

    Only the second line of every 4-line group (line index 1, 5, 9, ...)
    is kept, with trailing whitespace removed.

    filename -- path of the file to read.
    total    -- expected number of lines; passed to tqdm only to size the
                progress bar. Optional: None lets tqdm run without a total.

    Returns a list of str, one entry per kept line.
    """
    # The context manager closes the handle even if parsing fails part-way;
    # previously the file was opened and never closed.
    with open(filename, "r") as infile:
        return [
            line.rstrip()
            for i, line in tqdm(enumerate(infile), desc='parsing', total=total)
            if i % 4 == 1
        ]

In [10]:
# Total number of lines in the FASTQ file; hardcoded because the file does not
# change. It is only passed to read_fastq as the progress-bar total.
lines = 389607888

In [11]:
# Parse the whole file once; reuse the `datafile` path defined above instead of
# repeating the literal, so the two cannot drift apart.
raw_data = read_fastq(datafile, lines)

parsing: 100%|██████████| 389607888/389607888 [05:34<00:00, 1165424.06it/s]


In [12]:
# Sanity check: display the first five parsed reads.
raw_data[:5]

['NGAGATAGCGCGTCGTAACCCTGGTGAGCNNNNNNNNNNNNNNNNNNNNN',
 'GGATCCGTCATTCCAGTATCAGCATTGCCCGGCTGGCGAAACCGATTGGC',
 'ATATCCATACCCGAAACCAGGGTGGAAATATACTGACCAACGCCAGAGTC',
 'TGCTTATCGCCCTGCTCCAGCAACTCAATCGCCTCGCCGAAACGCTTATA',
 'TACTCAGCAGGAAACTCTCGGGGAAATTGTGACTGAGATTTTGAAAGATG']

In [13]:
from random import choice, shuffle

In [14]:
# Shuffles raw_data in place, so re-running this cell reorders the list again.
# No random seed is set, so the order differs between runs.
# NOTE(review): build_dataset samples with random.choice, so this shuffle may
# be redundant — confirm before removing.
shuffle(raw_data)

In [15]:
def build_dataset(num_reads, raw_data):
    """
    Return a list of <num_reads> reads drawn from <raw_data>.

    Each entry is picked independently with random.choice, so the same
    read may appear more than once. A tqdm bar labelled 'Building'
    tracks progress.
    """
    return [
        choice(raw_data)
        for _ in tqdm(range(num_reads), desc='Building')
    ]

In [16]:
data = build_dataset(15000, raw_data)
# NOTE(review): `sc` is not created in any visible cell; presumably the kernel
# provides a SparkContext under that name — confirm.
rdd_data = sc.parallelize(data)
dataLength = len(data[0])  # length of the first sampled read
# count() and getNumPartitions() are RDD methods, so they must be called on
# rdd_data. `data` is a plain list: list.count() needs an argument and lists
# have no getNumPartitions().
dataCount = rdd_data.count() // rdd_data.getNumPartitions()

Building: 100%|██████████| 15000/15000 [00:00<00:00, 341787.64it/s]


Exception: could not open socket

In [None]:
# Number of reads parsed from the FASTQ file (the identifier was truncated to
# `raw_`, which raises NameError).
len(raw_data)