In [1]:
from pyspark import SparkConf, SparkContext
sc = SparkContext()

# cartesian

In [2]:
# 笛卡尔
r1 = sc.parallelize([('k1', 'v1'), ('k2', 'v2')])
r2 = sc.parallelize([('k3', 'v3'), ('k4', 'v4'), ('k5', 'v5')])
r3 = r1.cartesian(r2)
r3.collect()

[(('k1', 'v1'), ('k3', 'v3')),
 (('k1', 'v1'), ('k4', 'v4')),
 (('k1', 'v1'), ('k5', 'v5')),
 (('k2', 'v2'), ('k3', 'v3')),
 (('k2', 'v2'), ('k4', 'v4')),
 (('k2', 'v2'), ('k5', 'v5'))]

# word cartesian

In [3]:
! cat './data.txt'

crazy crazy fox jumped over the fence
crazy fox jumped
the fence is high of fox
crazy fox is smart
fox jumped very high


In [5]:
lines = sc.textFile('./data.txt')
words = lines.map(lambda x : x.split(' ')).fold([], lambda x, y : x+y)
words = sc.parallelize(words)
word_cartesian = words.cartesian(words)
word_cartesian.collect()

[('crazy', 'crazy'),
 ('crazy', 'crazy'),
 ('crazy', 'fox'),
 ('crazy', 'jumped'),
 ('crazy', 'over'),
 ('crazy', 'the'),
 ('crazy', 'fence'),
 ('crazy', 'crazy'),
 ('crazy', 'fox'),
 ('crazy', 'jumped'),
 ('crazy', 'the'),
 ('crazy', 'fence'),
 ('crazy', 'is'),
 ('crazy', 'high'),
 ('crazy', 'of'),
 ('crazy', 'fox'),
 ('crazy', 'crazy'),
 ('crazy', 'fox'),
 ('crazy', 'is'),
 ('crazy', 'smart'),
 ('crazy', 'fox'),
 ('crazy', 'jumped'),
 ('crazy', 'very'),
 ('crazy', 'high'),
 ('crazy', 'crazy'),
 ('crazy', 'crazy'),
 ('crazy', 'fox'),
 ('crazy', 'jumped'),
 ('crazy', 'over'),
 ('crazy', 'the'),
 ('crazy', 'fence'),
 ('crazy', 'crazy'),
 ('crazy', 'fox'),
 ('crazy', 'jumped'),
 ('crazy', 'the'),
 ('crazy', 'fence'),
 ('crazy', 'is'),
 ('crazy', 'high'),
 ('crazy', 'of'),
 ('crazy', 'fox'),
 ('crazy', 'crazy'),
 ('crazy', 'fox'),
 ('crazy', 'is'),
 ('crazy', 'smart'),
 ('crazy', 'fox'),
 ('crazy', 'jumped'),
 ('crazy', 'very'),
 ('crazy', 'high'),
 ('fox', 'crazy'),
 ('fox', 'crazy'),
 (

In [9]:
# 去重， 计数
bigrams = word_cartesian.distinct()

n_combinations = word_cartesian.count()
word_cartesian = word_cartesian.map(lambda x: (x, 1)).reduceByKey(lambda x,y : x+y)
word_cartesian.collect()

[(('of', 'very'), 1),
 (('the', 'very'), 2),
 (('of', 'the'), 2),
 (('jumped', 'fox'), 15),
 (('fence', 'fence'), 4),
 (('crazy', 'over'), 4),
 (('of', 'over'), 1),
 (('high', 'crazy'), 8),
 (('smart', 'high'), 2),
 (('the', 'is'), 4),
 (('the', 'fence'), 4),
 (('very', 'very'), 1),
 (('of', 'fence'), 2),
 (('of', 'crazy'), 4),
 (('very', 'fox'), 5),
 (('high', 'fence'), 4),
 (('fox', 'very'), 5),
 (('jumped', 'high'), 6),
 (('smart', 'smart'), 1),
 (('crazy', 'high'), 8),
 (('is', 'crazy'), 8),
 (('high', 'jumped'), 6),
 (('high', 'of'), 2),
 (('crazy', 'smart'), 4),
 (('fox', 'fox'), 25),
 (('of', 'jumped'), 3),
 (('fence', 'crazy'), 8),
 (('fox', 'smart'), 5),
 (('fence', 'very'), 2),
 (('crazy', 'fox'), 20),
 (('smart', 'the'), 2),
 (('over', 'fox'), 5),
 (('smart', 'fence'), 2),
 (('high', 'over'), 2),
 (('very', 'smart'), 1),
 (('jumped', 'very'), 3),
 (('over', 'jumped'), 3),
 (('the', 'high'), 4),
 (('very', 'jumped'), 3),
 (('jumped', 'is'), 6),
 (('is', 'the'), 4),
 (('of', '

# combine-by-key

In [3]:
rdd = sc.parallelize([('k1', 1), ('k1', 2), ('k1', 3), ('k1', 4), ('k1', 5),
                                 ('k2', 6), ('k2', 7), ('k2', 8),
                                 ('k3', 9), ('k4', 10)])

In [4]:
rdd.combineByKey( 
                                (lambda x: (x, 1)), 
                                (lambda x, y: (x[0] + y, x[1] + 1)), 
                                (lambda x, y: (x[0] + y[0], x[1] + y[1])) 
                               ).collect()

[('k1', (15, 5)), ('k3', (9, 1)), ('k4', (10, 1)), ('k2', (21, 3))]

In [6]:
# 等价于楼上
rdd.map(lambda x : (x[0], (x[1], 1))).reduceByKey(lambda x, y: (x[0]+y[0], x[1]+y[1])).collect()

[('k1', (15, 5)), ('k3', (9, 1)), ('k4', (10, 1)), ('k2', (21, 3))]

# dna-count

In [2]:
dna_seq = sc.textFile('./dna_seq.txt')

dna_seq.collect()

['ATATCCCCGGGAT', 'ATCGATCGATATA']

In [11]:
#   dnas = sc.parallelize(dna_seq.fold('', lambda x, y: x+y)).map(lambda x: (x, 1)).collect()
dnas = dna_seq.flatMap(lambda x : [(c, 1) for c in list(x)]).collect()
dnas

[('A', 1),
 ('T', 1),
 ('A', 1),
 ('T', 1),
 ('C', 1),
 ('C', 1),
 ('C', 1),
 ('C', 1),
 ('G', 1),
 ('G', 1),
 ('G', 1),
 ('A', 1),
 ('T', 1),
 ('A', 1),
 ('T', 1),
 ('C', 1),
 ('G', 1),
 ('A', 1),
 ('T', 1),
 ('C', 1),
 ('G', 1),
 ('A', 1),
 ('T', 1),
 ('A', 1),
 ('T', 1),
 ('A', 1)]

In [12]:
base_count = sc.parallelize(dnas).reduceByKey(lambda x ,y: x+y)
base_count.collect()

[('T', 7), ('G', 5), ('C', 6), ('A', 8)]

### solution 2 : custom function

In [14]:
def mapper(seq):
    freq = dict()
    for x in list(seq):
        if x in freq:
            freq[x] += 1
        else :
            freq[x] = 1
    kv = [(x, freq[x]) for x in freq]
    return kv

dna_count = dna_seq.flatMap(mapper)
dna_count.collect()

[('A', 3),
 ('T', 3),
 ('C', 4),
 ('G', 3),
 ('A', 5),
 ('T', 4),
 ('C', 2),
 ('G', 2)]

In [15]:
dna_count = dna_count.reduceByKey(lambda x,y: x+y)
dna_count.collect()

[('C', 6), ('A', 8), ('T', 7), ('G', 5)]

# mapPartitions()

类似于map(),但是不同于map(),mapPartitions()在每一个分区（块）上单独运行,传入函数的参数必须为一个可迭代的对象。
分区：在初始化RDD对象时指定或默认的分区。

例如：要寻找RDD对象每个分区中的最大最小值，mapPartitions()就很合适。

In [38]:
nums = [1,2,3,4,5,6,7,8,9,10]
nums_rdd = sc.parallelize(nums, 3)
print(nums_rdd.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [40]:
# rdd object distributed across five partitions
nums_rdd.getNumPartitions()


3

In [41]:
def adder(iterator):
    yield sum(iterator)

nums_rdd.mapPartitions(adder).collect()

[6, 15, 34]

# min max

In [57]:
def minmax(iter):
    firstiter = 0
    for x in iter:
        if firstiter == 0:
            min_num = x
            max_num = x
            firstiter = 1
        else:
            if x > max_num:
                max_num = x
            if x < min_num:
                min_num = x
    return (min_num, max_num)

nums_rdd.mapPartitions(minmax).collect()

[1, 3, 4, 6, 7, 10]

# top-N

In [59]:
nums

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [62]:
top_3 = nums_rdd.takeOrdered(3)
top_3

[1, 2, 3]

In [64]:
bottom_3 = nums_rdd.takeOrdered(3, key = lambda x: -x)
bottom_3

[10, 9, 8]

In [65]:
kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
sc.parallelize(kv).takeOrdered(3, key=lambda x: x[0])

[(1, 'z2'), (2, 'z3'), (3, 'z5')]

In [66]:
kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
sc.parallelize(kv).takeOrdered(3, key=lambda x: x[1])

[(10, 'z1'), (1, 'z2'), (2, 'z3')]

In [67]:
kv = [(10,"z1"), (1,"z2"), (2,"z3"), (9,"z4"), (3,"z5"), (4,"z6"), (5,"z7"), (6,"z8"), (7,"z9")]
sc.parallelize(kv).takeOrdered(3, key=lambda x: -x[0])

[(10, 'z1'), (9, 'z4'), (7, 'z9')]