In [1]:
from fast_pyspark_tester import Context

Word Count
---------

In [2]:
# Count how often each alphanumeric word occurs in the README.
(
    Context()
    .textFile('../README.rst')
    # Blank out every non-alphanumeric character so that spaces are the only separators.
    .map(lambda text: ''.join(c if c.isalnum() else ' ' for c in text))
    # A bare split() also discards the empty strings left by runs of spaces.
    .flatMap(lambda text: text.split())
    # Pair every word with a count of one, then sum the counts per word.
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('image', 3),
 ('https', 9),
 ('raw', 2),
 ('githubusercontent', 2),
 ('com', 7),
 ('svenkreiss', 7),
 ('fast_pyspark_tester', 22),
 ('master', 4),
 ('logo', 2),
 ('w100', 1),
 ('png', 2),
 ('target', 3),
 ('github', 5),
 ('Pysparkling', 2),
 ('provides', 1),
 ('a', 13),
 ('faster', 1),
 ('more', 1),
 ('responsive', 3),
 ('way', 1),
 ('to', 10),
 ('develop', 1),
 ('programs', 1),
 ('for', 11),
 ('PySpark', 4),
 ('It', 1),
 ('enables', 1),
 ('code', 6),
 ('intended', 1),
 ('Spark', 5),
 ('applications', 1),
 ('execute', 1),
 ('entirely', 1),
 ('in', 3),
 ('Python', 1),
 ('without', 1),
 ('incurring', 1),
 ('the', 12),
 ('overhead', 1),
 ('of', 8),
 ('initializing', 1),
 ('and', 13),
 ('passing', 1),
 ('data', 3),
 ('through', 1),
 ('JVM', 1),
 ('Hadoop', 1),
 ('The', 2),
 ('focus', 1),
 ('is', 4),
 ('on', 3),
 ('having', 1),
 ('lightweight', 1),
 ('fast', 4),
 ('implementation', 1),
 ('small', 3),
 ('datasets', 2),
 ('at', 1),
 ('expense', 1),
 ('some', 2),
 ('resilience', 1),
 ('featu

## Distributed Confusion Matrix

In [3]:
import numpy as np

# Toy data set: twelve label pairs over the classes 0, 1 and 2, ordered
# (predicted, true) as the variable name indicates.
predicted_true = Context().parallelize(
    [
        (0, 0), (1, 0), (2, 2), (1, 1),
        (1, 2), (2, 2), (0, 2), (1, 0),
        (2, 1), (1, 1), (0, 0), (0, 0),
    ]
)

def cm_per_pair(p_t, n_classes=3):
    """Return the confusion matrix contributed by a single label pair.

    Args:
        p_t: Indexable pair of integer class labels. ``p_t[0]`` selects the
            row and ``p_t[1]`` the column; in this notebook the pairs are
            ordered (predicted, true).
        n_classes: Number of classes, i.e. the side length of the square
            matrix. Defaults to 3, the value that used to be hard-coded.

    Returns:
        A float array of shape ``(n_classes, n_classes)`` that is zero
        everywhere except for a single 1 at ``[p_t[0], p_t[1]]``.

    Raises:
        IndexError: If a label is not a valid index for the matrix.
    """
    cm = np.zeros((n_classes, n_classes))
    cm[p_t[0], p_t[1]] = 1
    return cm

# Build the full confusion matrix by summing the per-pair matrices.
predicted_true.aggregate(
    zeroValue=np.zeros(shape=(3, 3)),
    # Fold one (predicted, true) pair into the running matrix.
    seqOp=lambda running_cm, pair: running_cm + cm_per_pair(pair),
    # Merge two partial matrices by element-wise addition.
    combOp=lambda left_cm, right_cm: left_cm + right_cm,
)

array([[ 3.,  0.,  1.],
       [ 2.,  2.,  1.],
       [ 0.,  1.,  2.]])

Accessing Public Datasets on AWS S3
===================

The next demos require valid AWS account credentials, provided through the environment variables ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY``.


Common Crawl
-----------

More info on the dataset is in this [blog post](http://blog.commoncrawl.org/2015/05/march-2015-crawl-archive-available/).

In [3]:
# Read the path listings for the WARC and WAT files of the CC-MAIN-2015-11
# Common Crawl. Both S3 locations are handed to textFile() as one
# comma-separated string (the two adjacent literals below are concatenated);
# the first location ends in a `*` wildcard.
paths_rdd = Context().textFile(
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*,'
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/wat.paths.gz'
)

# Print five sampled entries to show what the listings contain.
print(paths_rdd.takeSample(5))

[u'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462555.21/warc/CC-MAIN-20150226074102-00158-ip-10-28-5-156.ec2.internal.warc.gz', u'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462982.10/warc/CC-MAIN-20150226074102-00095-ip-10-28-5-156.ec2.internal.warc.gz', u'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936463181.39/warc/CC-MAIN-20150226074103-00261-ip-10-28-5-156.ec2.internal.warc.gz', u'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936463956.95/wat/CC-MAIN-20150226074103-00277-ip-10-28-5-156.ec2.internal.warc.wat.gz', u'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936465069.3/wat/CC-MAIN-20150226074105-00332-ip-10-28-5-156.ec2.internal.warc.wat.gz']


Human Microbiome Project
------------------------

In [4]:
# Access data from the Human Microbiome Project: the trailing `*` selects
# everything under the DEMO `by_subject` prefix of the public S3 bucket.
by_subject_rdd = Context().textFile(
    's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*'
)
# Print a single sampled record (a nucleotide sequence in the run shown below).
print(by_subject_rdd.takeSample(1))

[u'ACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGCCACATTGGGACTGAGATA']
