# Wordcount

In [13]:
# script1.py
import sys
wordcount = {}
for line in sys.stdin:
  line = line.strip()
  for word in line.split():
    if word not in wordcount:
      wordcount[word] = 1
    else:
      wordcount[word] += 1

for k,v in wordcount.items():
  print k,v

In [16]:
!cat pg4300.txt | ./script1.py | head

fawn 3
highspliced 1
noctambules 1
Debts 1
grenadier. 1
considered? 1
woods 2
clotted 4
phenomenologist 1
hanging 22
Traceback (most recent call last):
  File "./script1.py", line 13, in <module>
    print k,v
IOError: [Errno 32] Broken pipe


On linux command-line: `cat pg4300.txt | ./script1.py`

Output:
```
fawn 3
highspliced 1
noctambules 1
Debts 1
grenadier. 1
considered? 1
woods 2
clotted 4
phenomenologist 1
hanging 22
...
```

But very difficult to parallelize, mostly because of global variable `wordcount` which is mutable. Consider the following piece of code.

In [17]:
a = 1

def add(x):
  global a
  a = a + x
  return a

def multiply_by_two():
  global a
  return a*2

print("Start")
print(a)
print("Multiply a by 2")
print(multiply_by_two())
print("Add 5 to a")
print(add(5))
print("Multiply a by 2")
print(multiply_by_two())

Start
1
Multiply a by 2
2
Add 5 to a
6
Multiply a by 2
12


Depending on the moment that you run `multiply_by_two()` you will get another answer: `2` or `12`.

# Functional programming

### Exponentials: the non-functional way

In [18]:
def loopExp(x,n):
    tmp = 1
    for i in range(0,n):
        tmp = tmp * x
    return tmp

In [19]:
loopExp(5,3)

125

### Exponentials: the functional way

In [20]:
def exp(x,n):
  if n == 0:
    return 1
  else:
    return x * exp(x, n-1)

In [21]:
exp(5,3)

125

### Higher-order functions

In [22]:
def exp2(x):
    return exp(x,2)

In [23]:
def sum(x,y):
    return x + y

## Map, reduce, filter

### Map
Apply a function to every element of a list

In [24]:
map(exp2, [1,2,3,4,5])

[1, 4, 9, 16, 25]

... or using an anonymous function with `lambda`

In [25]:
map(lambda x: x+2, [1,2,3,4,5])

[3, 4, 5, 6, 7]

### Reduce
Collapse all elements using a function

In [26]:
reduce(sum, [1,2,3,4,5])

15

... or using an anonymous function with 'lambda'

In [27]:
reduce(lambda a,b: a+b, [1,2,3,4,5])

15

### Filter
Filter elements from a list

In [28]:
filter(lambda x: x > 2, [1,2,3,4,5])

[3, 4, 5]

### Combining these

In [11]:
reduce(sum, map(exp2, [1,2,3,4]))

30

# Spark

Spark notebook started with:

`docker run -v /Users/jaerts/Google\ Drive/Teaching/I0U19A/ExerciseMaterial:/home/jovyan/work -d -p 8888:8888 jupyter/pyspark-notebook start-notebook.sh`

In [2]:
import os
import sys
import pyspark

In [3]:
sc = pyspark.SparkContext('local[*]')

In [5]:
file = sc.textFile("pg4300_1000.txt")
counts = file.flatMap(lambda line: line.split(" ")) \
                .map(lambda word: (word,1)) \
                .reduceByKey(lambda a,b:a+b)
counts.collect()

[(u'', 489),
 (u'all', 16),
 (u'Bread,', 1),
 (u'ashamed', 1),
 (u'Lead', 1),
 (u'stay,', 1),
 (u"don't,", 1),
 (u'Father', 1),
 (u'relief.', 1),
 (u'crumpled', 1),
 (u'yellow', 2),
 (u'moved', 2),
 (u'mild', 1),
 (u'--Four', 1),
 (u'go', 2),
 (u'druids.', 1),
 (u'saved', 1),
 (u'Today', 1),
 (u'up.', 2),
 (u'looking', 3),
 (u'up,', 5),
 (u'look.', 1),
 (u'now.', 1),
 (u'agony.', 1),
 (u'Chrysostomos.', 1),
 (u'send', 1),
 (u'tea.', 1),
 (u'corpuscles.', 1),
 (u'tea,', 5),
 (u'mother,', 1),
 (u'funk?', 1),
 (u'suffered', 1),
 (u'And', 8),
 (u'helped', 1),
 (u'shielding', 1),
 (u'lobes', 1),
 (u'very', 3),
 (u'cheap', 2),
 (u'nomine', 1),
 (u'floated', 1),
 (u'soul.', 1),
 (u'scornful', 1),
 (u'every', 2),
 (u'sunny', 2),
 (u'Not', 1),
 (u'more,', 1),
 (u'word', 1),
 (u'trouble', 1),
 (u'Where', 2),
 (u'--Pooh!', 1),
 (u'No,', 1),
 (u"we'll", 2),
 (u'kinswoman', 1),
 (u'sighed', 3),
 (u'Grogan', 2),
 (u'attires.', 1),
 (u'did', 6),
 (u'forth', 2),
 (u'--Do,', 1),
 (u'bay,', 2),
 (u'brig

In [4]:
# Output is a list:
type(counts.collect())
counts.collect()[1]

(u'fawn', 3)

In [6]:
file.first()

u'The Project Gutenberg EBook of Ulysses, by James Joyce'

In [8]:
file.filter(lambda line: "clotted" in line).count()

4

In [4]:
sc.textFile("pg4300.txt").map(lambda s: len(s)).reduce(lambda a, b: a + b)

1506937

In [10]:
totalLength

1506937

In [13]:
data = sc.parallelize([1,2,3,4,5])

In [14]:
data.collect()

[1, 2, 3, 4, 5]

In [15]:
data

ParallelCollectionRDD[13] at parallelize at PythonRDD.scala:475

In [18]:
counter = 0
rdd = data
def increment_counter(x):
    global counter
    counter += x
rdd.foreach(increment_counter)
print("Counter value: ", counter)

('Counter value: ', 0)


In [19]:
rdd.collect()

[1, 2, 3, 4, 5]

In [27]:
rdd.map(lambda x: println(x))

PythonRDD[17] at RDD at PythonRDD.scala:48

In [33]:
rdd.take(2).foreach(lambda x: println(x))

AttributeError: 'list' object has no attribute 'foreach'

In [41]:
d = sc.parallelize([["this"], ["is", "a", "string"]])

In [42]:
d

ParallelCollectionRDD[27] at parallelize at PythonRDD.scala:475

In [45]:
d.flatMap(lambda x: len(x))

PythonRDD[30] at RDD at PythonRDD.scala:48

In [9]:
a = 1

def add(x):
    global a
    a = a + x
    return a

def multiply_by_two():
    global a
    a = a*2
    return a

print(a)
print(multiply_by_two())
print(add(5))
print(multiply_by_two())

1
2
7
14


In [10]:
def loopExp(x,n):
    tmp = 1
    for i in range(0,n):
        tmp = tmp*x
    return tmp

loopExp(2,3)

8

In [11]:
def exp(x,n):
    if n == 0:
        return 1
    else:
        return x*exp(x,n-1)
exp(2,3)

8

In [12]:
def exp2(x):
    return exp(x,2)

map(exp2, [1,2,3,4])


[1, 4, 9, 16]

In [13]:
def sum(x,y):
    return x+y

reduce(sum, [1,2,3,4])

10

In [14]:
reduce(sum, map(exp2, [1,2,3,4]))

30

In [15]:
reduce(sum, [1, 4, 9, 16])

30