In [1]:
# super class
class InputData(object):
    """Abstract source of input data; concrete subclasses implement ``read``."""

    def read(self):
        """Return the data held by this input.

        Raises:
            NotImplementedError: always, until a subclass overrides this method.
        """
        # The built-in is spelled NotImplementedError; a misspelt name would
        # surface as a NameError instead of the intended signal.
        raise NotImplementedError

In [2]:
# sub class
class PathInputData(InputData):
    """Input whose data is the text content of a file on disk."""

    def __init__(self, path):
        """Remember which file to read.

        Args:
            path: filesystem path passed to ``open`` when ``read`` is called.
        """
        super().__init__()
        self.path = path

    def read(self):
        """Return the entire content of the file at ``self.path`` as a string.

        The file is opened with ``open``'s default (text) mode.
        """
        # The context manager closes the handle as soon as the read finishes,
        # instead of leaving it open until the garbage collector runs.
        with open(self.path) as handle:
            return handle.read()

In [3]:
class Worker(object):
    """Base class for a map/reduce worker bound to one input object."""

    def __init__(self, input_data):
        """Store the input and start with no result.

        Args:
            input_data: the object this worker will consume in ``map``.
        """
        self.input_data = input_data
        self.result = None

    def map(self):
        """Compute this worker's partial result; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def reduce(self, other):
        """Merge ``other``'s result into this worker; subclasses must override.

        Args:
            other: another worker whose result should be combined with this one.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # Spelled NotImplementedError so callers get the intended exception
        # rather than a NameError from a misspelt identifier.
        raise NotImplementedError

In [4]:
class LineCountWorker(Worker):
    """Worker that counts newline characters in its input."""

    def map(self):
        """Read the input and store its number of ``'\\n'`` characters in ``result``."""
        self.result = self.input_data.read().count('\n')

    def reduce(self, other):
        """Add ``other.result`` onto this worker's own ``result``."""
        self.result += other.result

In [5]:
def generate_inputs(data_dir):
    """Lazily yield one ``PathInputData`` per entry listed in ``data_dir``.

    The directory is only listed once iteration begins.
    """
    import os
    entries = os.listdir(data_dir)
    yield from (PathInputData(os.path.join(data_dir, entry)) for entry in entries)

In [6]:
# Calling a generator function only creates the generator object; the
# directory is not listed until the generator is iterated.
generate_inputs('.')

<generator object generate_inputs at 0x7f3b88108258>

In [7]:
def create_workers(input_list):
    """Return a list holding one ``LineCountWorker`` for each item of ``input_list``."""
    return [LineCountWorker(item) for item in input_list]

In [8]:
# create_workers expects an iterable of input objects, not a directory name:
# iterating the string '.' would wrap the character itself in a worker.
workers = create_workers(generate_inputs('.'))

In [9]:
import threading
def execute(workers):
    """Run every worker's ``map`` on its own thread, then fold the results.

    Args:
        workers: sequence of objects exposing ``map()``, ``reduce(other)`` and
            a ``result`` attribute.

    Returns:
        The ``result`` of the first worker after every other worker has been
        reduced into it, or ``None`` when ``workers`` is empty.
    """
    if not workers:
        # Nothing to map or reduce; avoids an IndexError on workers[0] below.
        return None

    threads = [threading.Thread(target=worker.map) for worker in workers]
    for thread in threads:
        thread.start()
    # Every map must have finished before any result is read by reduce.
    for thread in threads:
        thread.join()

    first, rest = workers[0], workers[1:]
    for worker in rest:
        first.reduce(worker)
    return first.result

In [10]:
def mapreduce(data_dir):
    """Build inputs from ``data_dir``, wrap them in workers and return ``execute``'s result."""
    return execute(create_workers(generate_inputs(data_dir)))

In [11]:
# Total number of '\n' characters across the files in this directory.
# NOTE(review): absolute, machine-specific path; adjust before re-running.
mapreduce('/home/luno/Workspace/Sources/mora/api/statistics')

138

In [12]:
# Use @classmethod polymorphism to make the mapreduce function generic
class GenericInputData(object):
    """Interface for input sources that can also enumerate their own instances."""

    @classmethod
    def generate_inputs(cls, config):
        """Produce the input objects described by ``config``; subclasses must override."""
        raise NotImplementedError

    def read(self):
        """Return this input's data; subclasses must override."""
        raise NotImplementedError

In [18]:
import os
class PathInputData(GenericInputData):
    """Input whose data is the text content of a file on disk."""

    def __init__(self, path):
        """Remember which file to read.

        Args:
            path: filesystem path passed to ``open`` when ``read`` is called.
        """
        super().__init__()
        self.path = path

    def read(self):
        """Return the entire content of the file at ``self.path`` as a string.

        The file is opened with ``open``'s default (text) mode.
        """
        # The context manager closes the handle as soon as the read finishes,
        # instead of leaving it open until the garbage collector runs.
        with open(self.path) as handle:
            return handle.read()

    @classmethod
    def generate_inputs(cls, config):
        """Lazily yield one instance per entry of ``config['data_dir']``.

        Args:
            config: mapping that must contain the key ``'data_dir'``.

        Yields:
            ``cls`` instances, one for each name returned by ``os.listdir``.
        """
        data_dir = config['data_dir']
        # Echo the directory being scanned (runs when iteration starts).
        print(data_dir)
        for name in os.listdir(data_dir):
            yield cls(os.path.join(data_dir, name))

In [20]:
class GenericWorker(object):
    """Base map/reduce worker that can also build its own worker list."""

    def __init__(self, input_data):
        """Store the input and start with no result.

        Args:
            input_data: the object this worker will consume in ``map``.
        """
        self.input_data = input_data
        self.result = None

    def map(self):
        """Compute this worker's partial result; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it produces a TypeError. NotImplementedError is the
        # exception meant for abstract methods.
        raise NotImplementedError

    def reduce(self, other):
        """Merge ``other``'s result into this worker; subclasses must override.

        Args:
            other: another worker whose result should be combined with this one.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @classmethod
    def create_workers(cls, input_class, config):
        """Return a list with one ``cls`` worker per generated input.

        Args:
            input_class: object providing ``generate_inputs(config)``.
            config: passed through unchanged to ``input_class.generate_inputs``.
        """
        return [cls(input_data) for input_data in input_class.generate_inputs(config)]

In [22]:
class LineCountWorker(GenericWorker):
    """Generic worker whose result is a count of newline characters."""

    def map(self):
        """Set ``result`` to the number of ``'\\n'`` characters read from the input."""
        self.result = self.input_data.read().count('\n')

    def reduce(self, other):
        """Accumulate ``other.result`` into this worker's ``result``."""
        self.result += other.result

In [23]:
def mapreduce(worker_class, input_class, config):
    """Create workers via ``worker_class.create_workers`` and return ``execute``'s result."""
    return execute(worker_class.create_workers(input_class, config))

In [31]:
# NOTE(review): absolute, machine-specific path; adjust before re-running.
config = {'data_dir': '/home/luno/Workspace/Sources/mora/api/statistics'}
result = mapreduce(LineCountWorker, PathInputData, config)
print ('There are', result, 'lines') # same as the result of `cat *.go | wc -l`

/home/luno/Workspace/Sources/mora/api/statistics
There are 138 lines
