In [1]:
# Analyzing status updates
import datetime

# Sample input for the map-reduce examples: one dict per status update.
status_updates = [
    {
        "id": 2,
        "username": "joelgrus",
        "text": "Should I write a second edition of my data science book?",
        "created_at": datetime.datetime(2018, 2, 21, 11, 47, 0),
        "liked_by": ["data_guy", "data_gal", "mike"],
    },
    # ...
]

In [2]:
from typing import Iterable

from common.map_reduce import map_reduce
from common.sum_reducer import sum_reducer

def data_science_day_mapper(status_update: dict) -> Iterable:
    """
    Emit a single (weekday, 1) pair when the update's text mentions
    "data science" (case-insensitive); emit nothing otherwise.

    weekday is status_update["created_at"].weekday(), i.e. 0 for Monday
    through 6 for Sunday when "created_at" is a datetime.
    """
    lowered_text = status_update["text"].lower()
    if "data science" not in lowered_text:
        # No match: finish without touching "created_at".
        return
    yield (status_update["created_at"].weekday(), 1)

# Feed every status update through data_science_day_mapper and combine the
# emitted (day_of_week, 1) pairs with sum_reducer.
# NOTE(review): presumably this gives a total per weekday; confirm against
# common.map_reduce / common.sum_reducer.
data_science_days = map_reduce(
    status_updates,
    data_science_day_mapper,
    sum_reducer,
)

In [3]:
from collections import Counter

from common.kv_type import KV
from common.tokenize import tokenize

def words_per_user_mapper(status_update: dict):
    """
    Emit (username, (word, 1)) once for every token that tokenize()
    produces from the update's text, keyed by the update's author.
    """
    author = status_update["username"]
    tokens = tokenize(status_update["text"])
    yield from ((author, (token, 1)) for token in tokens)

def most_popular_word_reducer(user: str,
                              words_and_counts: Iterable[KV]) -> Iterable:
    """
    Given a sequence of (word, count) pairs for one user, yield a single
    (user, (word, total_count)) pair for the word with the highest
    total count.

    Counts for the same word are summed before comparing. Ties are resolved
    by Counter.most_common, which keeps equal counts in first-encountered
    order. If words_and_counts is empty, nothing is yielded.
    """
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count

    # most_common(1) is an empty list for an empty Counter; indexing it
    # would raise IndexError inside the generator, so bail out early.
    if not word_counts:
        return

    word, count = word_counts.most_common(1)[0]

    yield (user, (word, count))

# Map each status update to (username, (word, 1)) pairs, then reduce with
# most_popular_word_reducer.
# NOTE(review): presumably map_reduce groups the pairs by username before
# reducing, giving one top word per user; confirm against common.map_reduce.
user_words = map_reduce(
    status_updates,
    words_per_user_mapper,
    most_popular_word_reducer,
)

In [4]:
from common.count_distinct_reducer import count_distinct_reducer

def liker_mapper(status_update: dict):
    """
    Emit one (username, liker) pair for each entry in the update's
    "liked_by" collection, keyed by the update's author.
    """
    author = status_update["username"]
    yield from ((author, fan) for fan in status_update["liked_by"])

# Map each status update to (username, liker) pairs, then reduce with
# count_distinct_reducer.
# NOTE(review): presumably this counts the distinct likers per username;
# confirm against common.count_distinct_reducer.
distinct_likers_per_user = map_reduce(
    status_updates,
    liker_mapper,
    count_distinct_reducer,
)