In [3]:
import itertools
# Mapper function
def map(doc_id, text):
    stopwords = set(["the", "and", "of", "a", "to", "in", "is", "it"])
    words = text.split()
    for word in words:
        if word.lower() not in stopwords:
            yield (word.lower(), 1)

# Reducer function
def reduce(word, counts):
    total_count = sum(counts)
    yield (word, total_count)

# Read the content of the input file
def read_input_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

# Main function to run the MapReduce program
def main(file_path):
    input_text = read_input_file(file_path)
    # Assuming each line in the input file is a separate document
    documents = input_text.splitlines()

    # Mapper phase
    intermediate_output = []
    for doc_id, text in enumerate(documents):
        intermediate_output.extend(map(doc_id, text))

    # Sort and group intermediate output by keys
    intermediate_output.sort(key=lambda x: x[0])
    grouped_output = {}
    for key, group in itertools.groupby(intermediate_output, key=lambda x: x[0]):
        grouped_output[key] = [count for _, count in group]

    # Reducer phase
    final_output = []
    for word, counts in grouped_output.items():
        final_output.extend(reduce(word, counts))

    return final_output

if __name__ == "__main__":
    input_file_path = r"/content/sample_data/1.txt"
    result = main(input_file_path)
    print(result)


[('a,', 1), ('and,', 1), ('as', 1), ('be', 1), ('common', 1), ('contains', 1), ('input', 1), ('of,', 1), ('output.', 1), ('removed', 1), ('sample', 1), ('should', 1), ('some', 1), ('stopwords', 1), ('such', 1), ('text.', 1), ('the,', 1), ('these', 1), ('this', 1), ('to.', 1), ('words', 1)]
