# Faking MapReduce with Python's multiprocessing module
Footnote: there is also Python's mrjob module, developed in cooperation with Yelp, which targets Amazon's Elastic MapReduce service (EMR).

In [1]:
%%file multiprocessing_mapreduce.py

import collections
import itertools
import multiprocessing

class SimpleMapReduce(object):
    """Minimal map/partition/reduce driver built on a multiprocessing pool.

    Calling an instance runs ``map_func`` over the inputs in the worker
    pool, groups the resulting (key, value) pairs by key in the calling
    process, and then runs ``reduce_func`` over the groups in the pool.

    The instance owns its pool; call ``close()`` (or use the instance as a
    context manager) to shut the workers down when finished.
    """

    def __init__(self, map_func, reduce_func, num_workers=None):
        """
        map_func

          Function to map inputs to intermediate data. Takes as
          argument one input value and returns an iterable of
          (key, value) tuples to be reduced.

        reduce_func

          Function to reduce partitioned version of intermediate data
          to final output. Takes as its single argument a
          (key, values) tuple: a key as produced by map_func and the
          list of all values associated with that key.

        num_workers

          The number of workers to create in the pool. Defaults to the
          number of CPUs available on the current host.
        """
        self.map_func = map_func
        self.reduce_func = reduce_func
        self.pool = multiprocessing.Pool(num_workers)

    def close(self):
        """Stop accepting work and wait for the worker processes to exit."""
        self.pool.close()
        self.pool.join()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def partition(self, mapped_values):
        """Organize the mapped values by their key.

        mapped_values
          An iterable of (key, value) tuples.

        Returns the items view of a dict mapping each key to the list of
        its values, in the order the values were encountered.
        """
        partitioned_data = collections.defaultdict(list)
        for key, value in mapped_values:
            partitioned_data[key].append(value)
        return partitioned_data.items()

    def __call__(self, inputs, chunksize=1):
        """Process the inputs through the map and reduce functions given.

        inputs
          An iterable containing the input data to be processed.

        chunksize=1
          The portion of the input data to hand to each worker.  This
          can be used to tune performance during the mapping phase.

        Returns the list of reduce_func results, one per distinct key.
        """
        map_responses = self.pool.map(self.map_func, inputs, chunksize=chunksize)
        # Each map response is itself a sequence of (key, value) pairs;
        # flatten them lazily instead of unpacking them all as arguments.
        partitioned_data = self.partition(
            itertools.chain.from_iterable(map_responses)
        )
        reduced_values = self.pool.map(self.reduce_func, partitioned_data)
        return reduced_values

Writing multiprocessing_mapreduce.py


In [3]:
import multiprocessing
import string
import os
import re
import codecs

from multiprocessing_mapreduce import SimpleMapReduce

def file_to_words(filename):
    """Read a file and return a list of (word, 1) pairs.

    One pair is emitted per occurrence of a word, where a word is any
    run of two or more word characters (regex ``\\w``). Words are returned
    exactly as they appear in the file: there is no lower-casing and no
    stop-word filtering.

    filename
      Path of the file to read. It is decoded as latin1, which maps every
      byte to a character and therefore never raises a decoding error.

    Also prints the name of the current process and the file name, which
    shows which pool worker handled which file.
    """
    print(multiprocessing.current_process().name, 'reading', filename)
    with open(filename, 'r', encoding='latin1') as inf:
        text = inf.read()
    return [(word, 1) for word in re.findall(r'\b\w\w+\b', text)]


def count_words(item):
    """Reduce one partitioned entry to a (word, total) tuple.

    item is a two-element (word, occurrences) pair; the occurrences are
    summed to give the total count for that word.
    """
    key, tallies = item
    total = sum(tallies)
    return (key, total)


if __name__ == '__main__':
    # Driver: count words across the 20 newsgroups training files and
    # print the 20 most frequent ones.
    import operator
    import glob
    from sklearn.datasets import fetch_20newsgroups

    categories = [
        'alt.atheism',
    ]
    # Uncomment the following to do the analysis on all the categories
    #categories = None

    #print("Loading 20 newsgroups dataset for categories:")
    #print(categories)

    # NOTE(review): the recorded run of this cell ended in a
    # FileNotFoundError for one of the paths in data.filenames -- confirm
    # that those files actually exist on disk before mapping over them.
    data = fetch_20newsgroups(subset='train', categories=categories)

    #input_files = glob.glob('20news-bydate-train/alt.atheism/*')
    mapper = SimpleMapReduce(file_to_words, count_words)
    try:
        word_counts = mapper(data.filenames)
        #word_counts = mapper(input_files)
    finally:
        # Always release the worker processes, even if a worker failed;
        # otherwise every re-run of this cell leaves another pool behind.
        mapper.pool.close()
        mapper.pool.join()

    # Ascending sort followed by reverse gives descending frequency.
    word_counts.sort(key=operator.itemgetter(1))
    word_counts.reverse()

    print('\nTOP 20 WORDS BY FREQUENCY\n')
    top20 = word_counts[:20]
    # default=0 keeps an empty result (no files / no words) from raising.
    longest = max((len(word) for word, count in top20), default=0)
    for word, count in top20:
        print('%-*s: %5s' % (longest+1, word, count))

ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51318
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51215
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53753
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53284
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51253
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51315
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53761
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53521
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53065
ForkPoolWorker-2 reading /root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53082
ForkPoolWo

FileNotFoundError: [Errno 2] No such file or directory: '/root/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/51318'



# Example 0 WordCount on a single document
taken from Gutenberg (also, the stepping stone for Example 2, if we didn't have Google)

# Example 1 WordCount on 20 Newsgroups for classification
find the Top 20 (non-stop) words 

# Example 2
find the phrase (n-gram) with the largest (relative) gain per year; relative to the number of occurrences last year (and, if the total numbers per year vary widely, also with respect to the total number of n-grams in that year).

Test dataset (English One Million)