# Map Reduce

1. Use a mapper function to turn each item into zero or more key-value pairs. (Often this is called the map function, but there is already a Python function called map and we don't want to confuse the two.)

2. Collect together all the pairs with identical keys. 

3. Use a reducer function on each collection of grouped values to produce output values for the corresponding key.


In [1]:
from typing import List
from collections import Counter
from typing import Iterator, Tuple, Iterable
from collections import defaultdict

Not using Map Reduce

In [2]:
def tokenize(document: str) -> List[str]:
    """Break a document into tokens by splitting on runs of whitespace.

    No lowercasing or punctuation stripping is performed; an empty or
    all-whitespace document yields an empty list.
    """
    tokens: List[str] = document.split()
    return tokens

In [3]:
def word_count_old(documents: List[str]):
    """Tally word frequencies directly, without the MapReduce machinery.

    Each document is tokenized and its tokens are added to a single
    running Counter, which is returned.
    """
    tally = Counter()
    for text in documents:
        # Counter.update adds one to the count of every token it is given.
        tally.update(tokenize(text))
    return tally

Using Map Reduce

In [9]:
def wc_mapper(document: str) -> Iterator[Tuple[str, int]]:
    """Map step: yield a (word, 1) pair for every token in the document.

    Repeated words are emitted once per occurrence; summing them is the
    reducer's job.
    """
    yield from ((token, 1) for token in tokenize(document))

In [10]:
def wc_reducer(word: str, counts: Iterable[int]) -> Iterator[Tuple[str, int]]:
    """Reduce step: collapse all counts collected for a word into one total.

    Yields exactly one (word, total) pair.
    """
    total = sum(counts)
    yield word, total

In [19]:
def word_count(documents: List[str]) -> List[Tuple[str, int]]:
    """Count words across the input documents with a map / group / reduce pipeline.

    Returns a list of (word, count) pairs, one per distinct word, in the
    order the words were first seen. The grouped intermediate mapping is
    printed to stdout before the reduce step runs.
    """
    grouped = defaultdict(list)  # word -> list of emitted counts

    # Map + shuffle: run the mapper on every document and bucket values by key.
    for doc in documents:
        for key, value in wc_mapper(doc):
            grouped[key].append(value)

    # Show the grouped state between the map and reduce phases.
    print(grouped)

    # Reduce: feed each key's bucket to the reducer and gather whatever it yields.
    results = []
    for key, values in grouped.items():
        results.extend(wc_reducer(key, values))
    return results

### Example

In [12]:
# Sample documents; the mapper examples below index into this list.
documents = ["data science", "big data", "science fiction"]

In [17]:
# Calling the mapper only creates a generator object; no pairs are produced until it is iterated.
wc_mapper(documents[0])

<generator object wc_mapper at 0x00000205FA83F318>

In [18]:
# Materialize the generator to see the (word, 1) pairs emitted for the first document.
list(wc_mapper(documents[0]))

[('data', 1), ('science', 1)]

In [21]:
# Run the full pipeline; word_count also prints the grouped word -> counts mapping as a side effect.
wc = word_count(["data science", "big data", "science fiction"])

defaultdict(<class 'list'>, {'data': [1, 1], 'science': [1, 1], 'big': [1], 'fiction': [1]})


In [22]:
# Display the reduced (word, count) pairs returned by word_count.
wc

[('data', 2), ('science', 2), ('big', 1), ('fiction', 1)]