# PySpark
* Familiar interface (map, reduce, take, fold, etc.)
* Uses YARN and HDFS

### Monitoring:
    http://<driver-node>:4040
    
Assembler based on: 
    `https://pmelsted.wordpress.com/2013/11/23/naive-python-implementation-of-a-de-bruijn-graph/`

In [1]:
from pyspark import SparkContext

In [2]:
sc = SparkContext(master='local[*]')

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by getOrCreate at <string>:43 

In [None]:
# sc.defaultParallelism

Data is stored internally in an RDD (Resilient Distributed Dataset)

In [3]:
sc.defaultMinPartitions

2

In [4]:
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

In [5]:
raw_data = sc.textFile('hdfs://localhost:9000/Genome/Ba10k.sim1.fq')
raw_data.getNumPartitions()

2

## fastq file parser

In [6]:
data = raw_data.filter(lambda x: x[0] in ['A','C','G','T'])

In [7]:
k = 17

## calculate list of kmers
Both forward and the reverse complement.

In [8]:
fwd_list = data.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])


rev_comp = data.map(lambda x:''.join(reversed([complement.get(base, base) for base in x])))

rev_list = rev_comp.flatMap(lambda x: [x[i:i+k] for i in range(len(x.rstrip())-k+1)])

In [9]:
kmer_list = fwd_list + rev_list

## now we need to coalesce them, like WordCount

In [10]:
emitter = kmer_list.map(lambda x: (x, 1))

In [11]:
emitter.take(5)

[('GATAACTCGATTTAAAC', 1),
 ('ATAACTCGATTTAAACC', 1),
 ('TAACTCGATTTAAACCA', 1),
 ('AACTCGATTTAAACCAG', 1),
 ('ACTCGATTTAAACCAGA', 1)]

In [12]:
kmer_counts = emitter.reduceByKey(lambda x, y: x+y)
kmer_counts.take(10)

[('CAAAACATCGCCATTAC', 10),
 ('TACGAAATCATCATCAT', 9),
 ('ACAGATGTAAAAGAGCG', 1),
 ('ACAAGATAGTCTTTTTT', 1),
 ('TTGGTGATACAGATCAA', 12),
 ('AAATTCGAACTTTCGAA', 1),
 ('GAAGAACGTTCATAATA', 1),
 ('GATTTCCCAGGAGTTAT', 1),
 ('ATACGTACAGAGGGTTT', 1),
 ('AATTACGTGATGAATCA', 1)]

In [13]:
kmer_counts.count()

131480

In [14]:
kmer_counts.keys().take(5)

['CAAAACATCGCCATTAC',
 'TACGAAATCATCATCAT',
 'ACAGATGTAAAAGAGCG',
 'ACAAGATAGTCTTTTTT',
 'TTGGTGATACAGATCAA']

In [15]:
kmer_counts.values().take(5)

[10, 9, 1, 1, 12]

In [None]:
# kmer_dict = kmer_counts.collectAsMap()
# kmer_dict.take(5)

## Next, find the contigs by creating a graph and walking it.
Note: The original source expects d = {'kmer':count}
* Maybe build a dict in all_contigs then go forward?

In [24]:
def twin(km):
#     complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # return Seq.reverse_complement(km)
    return "".join(complement.get(base, base) for base in reversed(km))


In [25]:
def fw(km):
    for x in 'ACGT':
        yield km[1:]+x

def bw(km):
    for x in 'ACGT':
        yield x + km[:-1]

In [21]:
def contig_to_string(c):
    return c[0] + ''.join(x[-1] for x in c[1:])

def get_contig(d,km):
    '''
    Find kmer's contig.
    Return: the string, list of kmers in contig
    '''
    c_fw = get_contig_forward(d,km)

    c_bw = get_contig_forward(d,twin(km))

    if km in fw(c_fw[-1]):
        c = c_fw
    else:
        c = [twin(x) for x in c_bw[-1:0:-1]] + c_fw
    return contig_to_string(c),c


def get_contig_forward(d,km):
    c_fw = [km]

    while True:
        if sum(x in d for x in fw(c_fw[-1])) != 1:
            break

        cand = [x for x in fw(c_fw[-1]) if x in d][0]
        if cand == km or cand == twin(km):
            break # break out of cycles or mobius contigs
        if cand == twin(c_fw[-1]):
            break # break out of hairpins

        if sum(x in d for x in bw(cand)) != 1:
            break

        c_fw.append(cand)

    return c_fw

def all_contigs(k_tuples):
    d = dict(k_tuples)
    done = set()
    r = []
    for x in d:
        if x not in done:
            s,c = get_contig(d,x)
            for y in c:
                done.add(y)
                done.add(twin(y))
            r.append(s)
    return r

# Code below is for putting the graph in GFA format
# Could use it as part of a visualizer step: GFA -> .dot (for GraphViz)
#     G = {}
#     heads = {}
#     tails = {}
#     for i,x in enumerate(r):
#         G[i] = ([],[])
#         heads[x[:k]] = (i,'+')
#         tails[twin(x[-k:])] = (i,'-')

#     for i in G:
#         x = r[i]
#         for y in fw(x[-k:]):
#             if y in heads:
#                 G[i][0].append(heads[y])
#             if y in tails:
#                 G[i][0].append(tails[y])
#         for z in fw(twin(x[:k])):
#             if z in heads:
#                 G[i][1].append(heads[z])
#             if z in tails:
#                 G[i][1].append(tails[z])

#     return G,r



In [26]:
contigs = kmer_counts.mapPartitions(all_contigs)

In [27]:
contigs.take(5)

['CAAAACATCGCCATTAC',
 'TACGAAATCATCATCAT',
 'TTCTTACAAGATAGTCTTTTTT',
 'AAATTCGAACTTTCGAA',
 'TTGTTGATTTAACAGCTA']

In [28]:
contigs.count()

97777

In [29]:
contigs.getNumPartitions()

4