In [17]:
import os
import random
from tempfile import TemporaryDirectory
from threading import Thread

class InputData:
    """Abstract interface for an object whose data can be read."""

    def read(self):
        """Return the data of this input source.

        Raises:
            NotImplementedError: always; subclasses provide the real reader.
        """
        raise NotImplementedError()

class PathInputData(InputData):
    """Input data that is read from a file on disk."""

    def __init__(self, path):
        """Remember which file to read.

        Args:
            path (str): location of the file to read.
        """
        super().__init__()
        self.path = path

    def read(self):
        """Open the file in text mode and return its entire content."""
        with open(self.path) as source:
            content = source.read()
        return content

class Worker:
    """Base class for one unit of map/reduce work over a single input."""

    def __init__(self, input_data):
        """Create a worker instance.

        Args:
            input_data (InputData): object whose ``read()`` supplies the
                data this worker processes.
        """
        self.result = None  # no result until a subclass's map() sets one
        self.input_data = input_data

    def map(self):
        """Process ``input_data``; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def reduce(self, other):
        """Combine ``other``'s result into this worker; subclass hook.

        Args:
            other (Worker): worker whose result should be merged in.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

# Why doesn't LineCountWorker define its own __init__() that calls super()?
# Worker.__init__ already stores input_data and initialises result; this subclass adds no attributes of its own, so it simply inherits the parent's __init__.
class LineCountWorker(Worker):
    """Worker that counts the newline characters in its input.

    No ``__init__`` is written here: the one inherited from ``Worker``
    already sets ``input_data`` and ``result``, and nothing else is needed.
    """

    def map(self):
        """Read the input and store its newline count in ``self.result``."""
        self.result = self.input_data.read().count('\n')

    def reduce(self, other):
        """Add another worker's count onto this worker's ``result``.

        Args:
            other (LineCountWorker): worker whose ``result`` is added.
        """
        self.result += other.result

def generate_inputs(data_dir):
    """Lazily yield one ``PathInputData`` per entry found in ``data_dir``.

    Args:
        data_dir (str): directory whose entries are turned into inputs.

    Yields:
        PathInputData: input object pointing at ``data_dir/<entry>``.
    """
    yield from (
        PathInputData(os.path.join(data_dir, entry))
        for entry in os.listdir(data_dir)
    )


def create_workers(input_list):
    """Wrap every input object in its own ``LineCountWorker``.

    Args:
        input_list (iterable[PathInputData]): inputs to process, e.g. the
            generator returned by ``generate_inputs``.

    Returns:
        list[LineCountWorker]: one worker per input, in iteration order.
    """
    return [LineCountWorker(item) for item in input_list]

def execute(workers):
    """Run each worker's ``map`` in its own thread, then reduce the results.

    Args:
        workers (list[Worker]): workers to run; each must implement
            ``map`` and ``reduce`` (e.g. ``LineCountWorker``).

    Returns:
        The combined ``result`` held by the first worker after every other
        worker has been reduced into it (an int for ``LineCountWorker``),
        or ``None`` when ``workers`` is empty.
    """
    if not workers:
        # Nothing to map or reduce (e.g. an empty data directory); without
        # this guard the unpacking below fails with an opaque ValueError.
        return None

    threads = [Thread(target=worker.map) for worker in workers]
    for thread in threads:
        thread.start()
    # Wait for every map step so all results are set before reducing.
    for thread in threads:
        thread.join()

    first, *rest = workers
    for worker in rest:
        first.reduce(worker)
    return first.result

def mapreduce(data_dir):
    """Entry point: count lines over every file in ``data_dir``.

    Args:
        data_dir (str): data directory

    Returns:
        The value returned by ``execute`` for the created workers, i.e.
        the combined line count.
    """
    # Step one: lazy generator yielding one PathInputData per directory entry.
    inputs = generate_inputs(data_dir)
    print('inputs: ', inputs)
    # Steps two and three: wrap each input in a worker, then run every
    # worker.map in a thread and merge all results into one.
    return execute(create_workers(inputs))

def write_test_files(tmpdir, file_count=100, max_lines=100):
    """Fill ``tmpdir`` with files that each hold a random number of newlines.

    Files are named ``'0'`` to ``str(file_count - 1)``; each one contains
    between 0 and ``max_lines`` newline characters (inclusive) and nothing
    else.

    Args:
        tmpdir (str): directory to create (if needed) and populate.
        file_count (int): number of files to write. Defaults to 100.
        max_lines (int): upper bound for the newline count per file.
            Defaults to 100.
    """
    # exist_ok=True lets the helper be re-run on a directory that already
    # exists instead of failing with FileExistsError.
    os.makedirs(tmpdir, exist_ok=True)
    for index in range(file_count):
        with open(os.path.join(tmpdir, str(index)), 'w') as f:
            f.write('\n' * random.randint(0, max_lines))


# Directory (relative to the working directory) holding the sample files.
tmpdir = './test_inputs'
# Generate the sample files only when the directory does not exist yet.
if not os.path.isdir(tmpdir):
    write_test_files(tmpdir)
# Count the lines of every file in the directory and report the total.
result = mapreduce(tmpdir)
print('There are', result, 'lines')

inputs:  <generator object generate_inputs at 0x000002006A3DD9A0>
There are 5134 lines


In [6]:
from threading import Thread
class GenericInputData:
    """Abstract input source that also knows how to enumerate its inputs."""

    def read(self):
        """Return the data of this input source.

        Raises:
            NotImplementedError: always; subclasses provide the real reader.
        """
        raise NotImplementedError()

    @classmethod
    def generate_inputs(cls, config):
        """Produce the input objects described by ``config``.

        Args:
            config (dict): settings a subclass needs to locate its inputs.

        Raises:
            NotImplementedError: always; subclasses provide the real factory.
        """
        raise NotImplementedError()

class PathInputData(GenericInputData):
    """Input data that is read from a file on disk."""

    def __init__(self, path):
        """Remember which file to read.

        Args:
            path (str): location of the file to read.
        """
        super().__init__()
        self.path = path

    def read(self):
        """Open the file in text mode and return its entire content."""
        with open(self.path) as source:
            content = source.read()
        return content

    @classmethod
    def generate_inputs(cls, config):
        """Lazily yield one instance per entry of ``config["data_dir"]``.

        Args:
            config (dict): must contain the key ``"data_dir"`` naming the
                directory to list.

        Yields:
            An instance of ``cls`` pointing at ``data_dir/<entry>``.
        """
        data_dir = config["data_dir"]
        # cls(path) builds an instance of whichever class this method was
        # called on, so subclasses get instances of their own type.
        yield from (
            cls(os.path.join(data_dir, entry))
            for entry in os.listdir(data_dir)
        )

class GenericWorker:
    """Base class for a map/reduce worker with a generic factory method."""

    def __init__(self, input_data):
        """Create a worker instance.

        Args:
            input_data: object whose ``read()`` supplies the data to process.
        """
        self.result = None  # no result until a subclass's map() sets one
        self.input_data = input_data

    def map(self):
        """Process ``input_data``; must be implemented by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def reduce(self, other):
        """Combine ``other``'s result into this worker; subclass hook.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    @classmethod
    def create_worker(cls, input_class, config):
        """Build one worker of type ``cls`` per generated input.

        Args:
            input_class: class providing a ``generate_inputs(config)``
                classmethod (e.g. ``PathInputData``).
            config (dict): settings forwarded to ``generate_inputs``.

        Returns:
            list: instances of ``cls``, one per input, in generation order.
        """
        return [cls(item) for item in input_class.generate_inputs(config)]

class LineCountWorker(GenericWorker):
    """Worker that counts the newline characters in its input."""

    def map(self):
        """Read the input and store its newline count in ``self.result``."""
        self.result = self.input_data.read().count("\n")

    def reduce(self, other):
        """Add another worker's count onto this worker's ``result``.

        Args:
            other (LineCountWorker): worker whose ``result`` is added.
        """
        self.result += other.result


def execute(workers):
    """Run each worker's ``map`` in its own thread, then reduce the results.

    Args:
        workers (list[GenericWorker]): workers to run; each must implement
            ``map`` and ``reduce`` (e.g. ``LineCountWorker``).

    Returns:
        The combined ``result`` held by the first worker after every other
        worker has been reduced into it (an int for ``LineCountWorker``),
        or ``None`` when ``workers`` is empty.
    """
    if not workers:
        # Nothing to map or reduce (e.g. an empty data directory); without
        # this guard the unpacking below fails with an opaque ValueError.
        return None

    threads = [Thread(target=worker.map) for worker in workers]
    for thread in threads:
        thread.start()
    # Wait for every map step so all results are set before reducing.
    for thread in threads:
        thread.join()

    head, *others = workers
    for worker in others:
        head.reduce(worker)
    return head.result

def mapreduce(worker_class, input_class, config):
    """Create workers through the class-level factory and run them.

    Args:
        worker_class: worker class exposing a ``create_worker`` classmethod
            (e.g. ``LineCountWorker``).
        input_class: input class handed to ``create_worker``
            (e.g. ``PathInputData``).
        config (dict): settings forwarded to the input class.

    Returns:
        The value returned by ``execute`` for the created workers.
    """
    # create_worker is a classmethod, so it is called on the class itself;
    # no worker instance is needed to build the workers.
    return execute(worker_class.create_worker(input_class, config))

def write_test_files(tmpdir, file_count=100, max_lines=100):
    """Fill ``tmpdir`` with files that each hold a random number of newlines.

    Files are named ``'0'`` to ``str(file_count - 1)``; each one contains
    between 0 and ``max_lines`` newline characters (inclusive) and nothing
    else.

    Args:
        tmpdir (str): directory to create (if needed) and populate.
        file_count (int): number of files to write. Defaults to 100.
        max_lines (int): upper bound for the newline count per file.
            Defaults to 100.
    """
    # exist_ok=True lets the helper be re-run on a directory that already
    # exists instead of failing with FileExistsError.
    os.makedirs(tmpdir, exist_ok=True)
    for index in range(file_count):
        target = os.path.join(tmpdir, str(index))
        with open(target, 'w') as f:
            f.write('\n' * random.randint(0, max_lines))


# Directory (relative to the working directory) holding the sample files.
tmpdir = './test_inputs'
# Generate the sample files only when the directory does not exist yet.
if not os.path.isdir(tmpdir):
    write_test_files(tmpdir)
    
# The input class reads the directory to scan from the "data_dir" key.
config = {"data_dir": tmpdir}
# The worker and input classes are passed in; their classmethods build the instances.
result = mapreduce(LineCountWorker, PathInputData, config)
print(f"There are {result} lines")

There are 5134 lines


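**Summary of `@classmethod` and the `cls` parameter:**
- `@classmethod`, together with its `cls` parameter, lets us call a method on the class itself instead of on an instance of the class.
- Inside a classmethod, calling `cls(...)` returns a new instance of the class the method was called on (so a subclass gets instances of its own type).
- A classmethod is typically used as a factory: it usually returns an instance, or a list of instances, of that class.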

In [None]:
def generate_big_random_sentences(filename, linecount):
    """Write ``linecount`` random four-word sentences to ``filename``.

    Every line has the form ``"<noun> <verb> <adjective> <adverb>"``, with
    each word drawn independently from a small fixed vocabulary. An existing
    file is overwritten.

    Args:
        filename (str): path of the text file to write.
        linecount (int): number of sentences (lines) to write.
    """
    # Local import keeps this function independent of any module-level import.
    import random

    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    # One word is picked from each group, in this order, for every sentence.
    # Named word_groups (not `all`) so the built-in all() is not shadowed.
    word_groups = (nouns, verbs, adj, adv)

    with open(filename, 'w') as f:
        for _ in range(linecount):
            sentence = ' '.join(random.choice(group) for group in word_groups)
            f.write(sentence + '\n')

In [5]:
import random
# Small vocabulary used to assemble a random four-word sentence.
nouns = ("puppy", "car", "rabbit", "girl", "monkey")
verbs = ("runs", "hits", "jumps", "drives", "barfs")
adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
adj = ("adorable", "clueless", "dirty", "odd", "stupid")
# Named word_groups rather than `all`: a global called `all` would shadow the
# built-in all() for every cell run afterwards in the same kernel.
word_groups = [nouns, verbs, adj, adv]


# Pick one word from each group, in noun / verb / adjective / adverb order.
test = [random.choice(group) for group in word_groups]
print(test)

['rabbit', 'runs', 'dirty', 'merrily.']
