In [1]:
from distributed import Executor
from dask import delayed
from distributed import Client
from collections import defaultdict

In [2]:
# Create a distributed Client with default arguments; the bare name on the
# last line makes the notebook display the client's repr as the cell output.
client = Client()
client

<Client: scheduler="127.0.0.1:8786" processes=8 cores=8>

In [13]:

def read_fasta(infilename):
    """
    Read FASTA formatted <infilename> and return a list of its sequence lines.

    Lines beginning with '>' are skipped. Every other line is stripped of
    leading/trailing whitespace and becomes one list entry, so the result has
    one element per non-header line of the file (a blank line yields '').
    """
    with open(infilename, 'r') as handle:
        return [row.strip() for row in handle if not row.startswith('>')]

def read_fastq(filename):
    """
    Read fastq formatted <filename> and return a list of reads.

    Only the lines whose zero-based index is 1 modulo 4 are kept (the second
    line of each group of four); the trailing newline is removed from each.
    """
    with open(filename, "r") as handle:
        return [
            row.rstrip('\n')
            for position, row in enumerate(handle)
            if position % 4 == 1
        ]

In [18]:
def twin(km):
    """
    Return the reverse complement of <km>.

    A<->T and C<->G are swapped and the order of the characters is reversed;
    any other character (for example 'N' or a lower-case letter) is copied
    through unchanged.
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    flipped = [pairs.get(base, base) for base in km]
    flipped.reverse()
    return "".join(flipped)

def kmers(seq,k):
    """
    Yield every window of length <k> of <seq>, from left to right.

    Nothing is yielded when <seq> is shorter than <k>.
    """
    last_start = len(seq) - k
    start = 0
    while start <= last_start:
        yield seq[start:start + k]
        start += 1

def fw(km):
    """
    Yield the four strings obtained by dropping the first character of <km>
    and appending 'A', 'C', 'G' and 'T' in that order.
    """
    suffix = km[1:]
    for base in ('A', 'C', 'G', 'T'):
        yield suffix + base

def bw(km):
    """
    Yield the four strings obtained by dropping the last character of <km>
    and prepending 'A', 'C', 'G' and 'T' in that order.
    """
    prefix = km[:-1]
    for base in ('A', 'C', 'G', 'T'):
        yield base + prefix

#@delayed
def build(reads,k=31,limit=1):
    """
    Count the k-mers of <reads> and keep those seen more than <limit> times.

    Each read is split on 'N'. For every resulting piece, the k-mers of the
    piece and then the k-mers of twin(piece) are added to one shared tally.

    Parameters:
        reads: iterable of read strings.
        k:     k-mer length passed to kmers().
        limit: k-mers whose total count is <= limit are discarded.

    Returns:
        A plain dict mapping k-mer -> count, restricted to count > limit.
    """
    counts = defaultdict(int)
    for read in reads:
        for piece in read.split('N'):
            # Forward strand first, then its reverse complement.
            for strand in (piece, twin(piece)):
                for km in kmers(strand, k):
                    counts[km] += 1

    return {km: total for km, total in counts.items() if total > limit}


In [None]:
# Load the reads of the first FASTQ data set; `buffer` becomes a list with one
# string per read and is the input of the timing cells that follow.
input_file = '../../data/read_1.fq'
buffer = read_fastq(input_file)

In [None]:
# Wrap the function, not its result: delayed(build)(buffer) defers the call so
# that the counting happens inside d.compute(). delayed(build(buffer)) would
# run build eagerly in this cell and merely wrap the finished dict.
d = delayed(build)(buffer)

In [None]:
%%timeit
# Time the evaluation of the delayed object `d`.
d.compute()

In [None]:
%%timeit
# Baseline for comparison: time a direct call to build on the same reads.
b = build(buffer)

In [None]:
# Switch to the second FASTQ data set. NOTE: this rebinds `buffer`, so every
# later cell that reads `buffer` now works on these reads.
infile = '../../data/Ba10k.sim1.fq'
buffer = read_fastq(infile)

In [None]:
%%timeit 
# Baseline: time a direct call to build on the current `buffer`.
ba1 = build (buffer)

In [None]:
# delayed(build)(buffer) creates a lazy task; the previous delayed(build(buffer))
# executed build immediately, leaving nothing for ba2.compute() to measure.
ba2 = delayed(build)(buffer)

In [None]:
%%timeit 
# Time the evaluation of the delayed object `ba2`.
ba2.compute()

In [11]:
# Switch to the FASTA input. read_fasta returns one string per non-header
# line, so `buffer` is a list of sequence lines rather than whole records.
infile = '../../data/Ecoli_raw.fasta'
buffer = read_fasta(infile)

In [None]:
# Number of entries currently held in `buffer` (displayed as the cell output).
len(buffer)

In [None]:
%%timeit
# Baseline: time a direct call to build on the current `buffer`.
e = build(buffer)

In [None]:
# Build a lazy task: delayed(build)(buffer) postpones the call until
# ecoli.compute(), whereas delayed(build(buffer)) would run build right here.
ecoli = delayed(build)(buffer)

In [12]:
%%timeit
chunk_size = len(buffer) // 20
ecoli = {}
for i in range(20):
    chunk = buffer[chunk_size * i : chunk_size * (i + 1)]
    ecoli = build(chunk)
ecoli.compute()

#for k, v in ecoli.items():
#    print (k, v)

TypeError: object of type 'Delayed' has no len()

In [None]:
# `ecoli` may be a lazy dask Delayed (see the `ecoli = delayed(...)` cell)
# rather than a dict; resolve it first, because a Delayed cannot be iterated.
counts = ecoli.compute() if hasattr(ecoli, 'compute') else ecoli

# Print only a short preview: the full table has one line per distinct k-mer.
PREVIEW = 10
for shown, (km, count) in enumerate(counts.items()):
    if shown >= PREVIEW:
        break
    print(km, count)

In [14]:
# Load the SRA reads; this rebinds `buffer` once more.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere unless the file is placed at this exact location.
infile = '/Users/zen/Code/git/sra_data.fastq'
buffer = read_fastq(infile)

In [15]:
 # Ceiling division (never below 1) so that 100 slices of this size cover the
 # whole buffer; floor division would leave up to 99 trailing reads
 # unprocessed and would give a chunk size of 0 for fewer than 100 reads.
 chunk_size = max(1, -(-len(buffer) // 100))

In [21]:
%%time
sra = []
for i in range (100):
    chunk = buffer[chunk_size*i : chunk_size * (i + 1)]
    data = delayed(build)(chunk)
    sra.append(data)

data.compute()

CPU times: user 2min 24s, sys: 2.04 s, total: 2min 26s
Wall time: 2min 30s


In [19]:
%%time
# Baseline: one direct call to build on the full read set, for comparison with
# the chunked delayed run.
# NOTE(review): this rebinds `sra`, the name the chunked cell uses for its
# list of delayed tasks.
sra = build(buffer)

CPU times: user 7min 28s, sys: 30.3 s, total: 7min 58s
Wall time: 7min 58s
