# PySpark
* Familiar interface (map, reduce, take, fold, etc.)
* Uses YARN and HDFS

### Monitoring:
    http://<driver-node>:4040
    
Assembler based on: 
    `https://pmelsted.wordpress.com/2013/11/23/naive-python-implementation-of-a-de-bruijn-graph/`

In [None]:
#from pyspark import SparkContext
import pyspark

In [None]:
# sc.stop()

In [None]:
# conf = pyspark.SparkConf()

In [None]:
# conf.setMaster("yarn-client")

# conf.set('spark.driver.memory','4g')
# conf.set('spark.executor.memory','8g')
#conf.set('spark.executor.cores', '1')
# conf.set('spark.shuffle.service.enabled','true')
# conf.set('spark.dynamicAllocation.enabled','true')
# conf.set('spark.dynamicAllocation.initialExecutors','2')
# conf.set('spark.dynamicAllocation.minExecutors','2')

In [None]:
# sc = pyspark.SparkContext(conf=conf)

In [None]:
# `sc` must already exist in the session: the cells above that would create
# it are commented out.
sc.defaultParallelism

Data is stored internally in an RDD (Resilient Distributed Dataset)

In [None]:
# Display the context's default minimum partition count.
sc.defaultMinPartitions

In [None]:
# Base -> complementary base lookup used to build reverse complements.
# NOTE: the name is spelled "compliment"; later k-mer cells look the table up
# by this exact spelling, so rename every use together if you correct it.
compliment = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

In [None]:
# Load the FASTQ file from HDFS as an RDD of text lines.
raw_data = sc.textFile('hdfs://172.31.26.32:8020/genome/Ba10k.sim1.fq')
# Display how many partitions the file was split into.
raw_data.getNumPartitions()

## fastq file parser

In [None]:
# Keep only lines whose first character is a base letter (the sequence lines).
# The length check comes first so an empty line never reaches x[0].
# NOTE(review): a quality line can also begin with A/C/G/T and would pass this
# filter too -- confirm that is acceptable for this data.
data = raw_data.filter(lambda x: len(x) > 0 and x[0] in ['A','C','G','T'])

In [None]:
# k-mer length: every k-mer sliced in the cells below is k characters long.
k = 17

## calculate list of kmers
K-mers are taken from both the forward read and its reverse complement.

In [None]:
# Forward k-mers: every length-k window of the read, trailing whitespace excluded.
fwd_list = data.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])


# Reverse complement of each read. The lookup table must be referenced as
# `compliment`, exactly as it is spelled where it is defined. The read is
# stripped before reversing so trailing whitespace cannot end up at the front
# of the reversed read and leak into its k-mers.
rev_comp = data.map(lambda x:''.join(reversed([compliment.get(base, base) for base in x.rstrip()])))

# Reverse-strand k-mers, windowed the same way as the forward ones.
rev_list = rev_comp.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])

In [None]:
# Combine forward and reverse k-mers into one RDD (duplicates are kept).
kmer_list = fwd_list.union(rev_list)

## Now we need to aggregate them by key, like WordCount

In [None]:
# Pair every k-mer with a count of 1 so the counts can be summed per key.
emitter = kmer_list.map(lambda kmer: (kmer, 1))

In [None]:
# Peek at five (k-mer, 1) pairs.
emitter.take(5)

In [None]:
# Add up the 1s emitted for each distinct k-mer to get its occurrence count.
kmer_counts = emitter.reduceByKey(lambda left, right: left + right)
# kmer_counts.take(10)

In [None]:
# Number of (k-mer, count) pairs, i.e. the number of distinct k-mers.
kmer_counts.count()

In [None]:
# Peek at five of the k-mers (the keys of the count pairs).
kmer_counts.keys().take(5)

In [None]:
# Peek at five of the occurrence counts (the values of the count pairs).
kmer_counts.values().take(5)

In [None]:
# kmer_dict = kmer_counts.collectAsMap()
# kmer_dict.take(5)

## Next, find the contigs by creating a graph and walking it.
Note: The original source expects d = {'kmer':count}
* Maybe build a dict in all_contigs then go forward?

In [None]:
def twin(km):
    """
    Return the reverse complement of the k-mer ``km``.

    Characters other than A, C, G and T are kept unchanged (only their
    position is reversed).
    """
    pairs = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Complement in reading order first, then flip the whole sequence.
    flipped = [pairs.get(base, base) for base in km]
    flipped.reverse()
    return "".join(flipped)


In [None]:
def fw(km):
    """
    Yield the four candidate successors of ``km``: ``km`` without its first
    character, extended by A, C, G and T in that order.
    """
    tail = km[1:]
    for base in ('A', 'C', 'G', 'T'):
        yield tail + base

def bw(km):
    """
    Yield the four candidate predecessors of ``km``: A, C, G and T (in that
    order) prepended to ``km`` without its last character.
    """
    head = km[:-1]
    for base in ('A', 'C', 'G', 'T'):
        yield base + head

In [None]:
def contig_to_string(c):
    """
    Spell out a contig given as a list of k-mers.

    The result is the first k-mer followed by the last character of every
    later k-mer. ``c`` must contain at least one k-mer.
    """
    first, rest = c[0], c[1:]
    last_bases = [kmer[-1] for kmer in rest]
    return first + ''.join(last_bases)

def get_contig(d,km):
    '''
    Find the contig that contains the k-mer ``km``.

    d: container of known k-mers, passed through to get_contig_forward.
    km: the k-mer to start from.
    Return: (contig string, list of k-mers in the contig)
    '''
    forward = get_contig_forward(d, km)
    # Walking forward from the twin explores the other side of km.
    backward = get_contig_forward(d, twin(km))

    if km in fw(forward[-1]):
        # km is a forward neighbour of the last k-mer reached, so the forward
        # walk alone is used as the contig.
        kmers = forward
    else:
        # Reverse the twin-side walk, dropping its first element (twin(km)),
        # flip each k-mer back to this strand and put it in front.
        prefix = [twin(x) for x in backward[-1:0:-1]]
        kmers = prefix + forward
    return contig_to_string(kmers), kmers


def get_contig_forward(d,km):
    '''
    Extend a path forward from ``km`` while the next step is unambiguous.

    d: container of known k-mers (only membership tests are performed).
    km: the k-mer to start from.
    Return: list of k-mers on the path, starting with ``km``.
    '''
    path = [km]

    while True:
        tip = path[-1]

        # Exactly one known successor is required to keep going.
        successors = [x for x in fw(tip) if x in d]
        if len(successors) != 1:
            break

        cand = successors[0]
        if cand in (km, twin(km)):
            break # break out of cycles or mobius contigs
        if cand == twin(tip):
            break # break out of hairpins

        # The candidate must also have exactly one known predecessor.
        predecessors = sum(1 for x in bw(cand) if x in d)
        if predecessors != 1:
            break

        path.append(cand)

    return path

def all_contigs(k_tuples):
    '''
    Build every contig reachable from the given k-mers.

    k_tuples: iterable of (k-mer, value) pairs; anything dict() accepts.
    Return: list of contig strings.
    '''
    kmers = dict(k_tuples)
    seen = set()
    contig_strings = []
    for start in kmers:
        if start in seen:
            continue
        text, members = get_contig(kmers, start)
        # Mark both strands of every member so the same contig is not
        # produced again from another of its k-mers.
        for member in members:
            seen.update((member, twin(member)))
        contig_strings.append(text)
    return contig_strings

# Code below is for putting the graph in GFA format
# Could use it as part of a visualizer step: GFA -> .dot (for GraphViz)
#     G = {}
#     heads = {}
#     tails = {}
#     for i,x in enumerate(r):
#         G[i] = ([],[])
#         heads[x[:k]] = (i,'+')
#         tails[twin(x[-k:])] = (i,'-')

#     for i in G:
#         x = r[i]
#         for y in fw(x[-k:]):
#             if y in heads:
#                 G[i][0].append(heads[y])
#             if y in tails:
#                 G[i][0].append(tails[y])
#         for z in fw(twin(x[:k])):
#             if z in heads:
#                 G[i][1].append(heads[z])
#             if z in tails:
#                 G[i][1].append(tails[z])

#     return G,r



In [None]:
# Run all_contigs once per partition and collect the results to the driver;
# after .collect() `contigs` is a plain Python list, not an RDD.
# NOTE(review): all_contigs only sees the k-mers of its own partition, so a
# contig presumably cannot extend across partition boundaries -- confirm this
# is acceptable.
%time contigs = kmer_counts.mapPartitions(all_contigs).collect()

In [None]:
# Show the first five contig strings.
contigs[:5]

In [None]:
# `contigs` is the list returned by .collect(), so its size comes from len();
# list.count() needs a value argument and would raise TypeError here.
len(contigs)

In [None]:
# `contigs` is a collected Python list and has no partitions; report the
# partition count of the RDD the contigs were built from instead.
kmer_counts.getNumPartitions()

## Now let's try a bigger data set

In [None]:
# Load the larger FASTQ file; the second argument (100) requests a minimum
# number of partitions.
raw_data = sc.textFile('hdfs://172.31.26.32:8020/genome/sra_data.fastq', 100)
# Display how many partitions were actually created.
raw_data.getNumPartitions()

In [None]:
# k-mer length for this data set (the first run above used 17).
k = 21

In [None]:
# Keep only lines whose first character is a base letter; slicing with [:1]
# yields '' for an empty line, which matches none of the bases.
data = raw_data.filter(lambda line: line[:1] in ('A', 'C', 'G', 'T'))

In [None]:
# Forward k-mers: every length-k window of the read, trailing whitespace excluded.
fwd_list = data.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])


# Reverse complement of each read. The read is stripped before reversing so
# trailing whitespace cannot end up at the front of the reversed read and leak
# into its k-mers.
rev_comp = data.map(lambda x:''.join(reversed([compliment.get(base, base) for base in x.rstrip()])))

# Reverse-strand k-mers, windowed the same way as the forward ones.
rev_list = rev_comp.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])

In [None]:
# Combine both strands, tag each k-mer with 1, then sum the tags per k-mer.
kmer_list = fwd_list.union(rev_list)
emitter = kmer_list.map(lambda kmer: (kmer, 1))
kmer_counts = emitter.reduceByKey(lambda left, right: left + right)


In [None]:
%%time
# Build contigs per partition; there is no .collect() here, so `contigs`
# stays an RDD and the result is written straight to HDFS as text.
contigs = kmer_counts.mapPartitions(all_contigs)
contigs.saveAsTextFile('hdfs://172.31.26.32:8020/genome/sra_output')

## Now for the really big data set
And let's try to chain it, like the big boys do...

In [None]:
# Load the largest FASTQ file; the second argument (500) requests a minimum
# number of partitions.
raw_data = sc.textFile('hdfs://172.31.26.32:8020/genome/Ecoli-RR359304-2.fastq',500)
# Display how many partitions were actually created.
raw_data.getNumPartitions()

In [None]:
# k-mer length for this data set.
k = 21

In [None]:
# Keep only lines whose first character is a base letter; slicing with [:1]
# yields '' for an empty line, which matches none of the bases.
data = raw_data.filter(lambda line: line[:1] in ('A', 'C', 'G', 'T'))

In [None]:
# Forward k-mers: every length-k window of the read, trailing whitespace excluded.
fwd_list = data.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])
# Reverse complement of each read. The read is stripped before reversing so
# trailing whitespace cannot end up at the front of the reversed read and leak
# into its k-mers.
rev_comp = data.map(lambda x:''.join(reversed([compliment.get(base, base) for base in x.rstrip()])))
# Reverse-strand k-mers, windowed the same way as the forward ones.
rev_list = rev_comp.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])

In [None]:
# Combine both strands, tag each k-mer with 1, then sum the tags per k-mer.
kmer_list = fwd_list.union(rev_list)
emitter = kmer_list.map(lambda kmer: (kmer, 1))
kmer_counts = emitter.reduceByKey(lambda left, right: left + right)

In [None]:
# %%time
# Build contigs per partition; there is no .collect() here, so `contigs`
# stays an RDD and the result is written straight to HDFS as text.
contigs = kmer_counts.mapPartitions(all_contigs)
contigs.saveAsTextFile('hdfs://172.31.26.32:8020/genome/ecoli_output')