In [None]:
!pip install mrjob

# Python MRJob Package
- https://mrjob.readthedocs.io/en/latest/

`mrjob` is the easiest route to writing Python programs that run on Hadoop. If you use mrjob, you’ll be able to test your code locally without installing Hadoop or run it on a cluster of your choice.

# MRJob Examples
- https://github.com/Yelp/mrjob/tree/master/mrjob/examples


In [None]:
%%file wc.py
from mrjob.job import MRJob

class MRWordFrequencyCount(MRJob):
    """Total up the characters, words and lines of the input text."""

    def mapper(self, _, line):
        """Emit this line's contribution to each of the three totals.

        The input key is ignored. Three pairs are produced, in order:
        ("chars", length of the line), ("words", number of
        whitespace-separated tokens) and ("lines", 1).
        """
        per_line_counts = (
            ("chars", len(line)),
            ("words", len(line.split())),
            ("lines", 1),
        )
        yield from per_line_counts

    def reducer(self, key, values):
        """Add up every partial count emitted for ``key``."""
        total = sum(values)
        yield key, total


# Launch the job only when wc.py is executed directly as a script,
# not when it is imported as a module (as the notebook does below).
if __name__ == '__main__':
    MRWordFrequencyCount.run()

# Text - War and Peace
https://www.gutenberg.org/ebooks/2600

In [None]:
!curl https://www.gutenberg.org/files/2600/2600-0.txt -o warpeace.txt

In [None]:
import importlib

import wc

# Imports are cached for the lifetime of the kernel; reload so that edits made
# by re-running the `%%file wc.py` cell are actually picked up here.
wc = importlib.reload(wc)

# Run the word-count job on the downloaded text and print each output pair.
mr_job = wc.MRWordFrequencyCount(args=['warpeace.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key, value)

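# Temperature data
https://github.com/PacktPublishing/Frank-Kanes-Taming-Big-Data-with-Apache-Spark-and-Python

Each line of `1800.csv` is a comma-separated weather-station record. The job below reads the first four fields as station ID, date, entry type (for example `TMAX`) and reading, and treats the reading as tenths of a degree Celsius.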

In [None]:
!curl https://raw.githubusercontent.com/PacktPublishing/Frank-Kanes-Taming-Big-Data-with-Apache-Spark-and-Python/master/1800.csv -o 1800.csv

In [None]:
# Show the last 10 lines of the CSV to get a feel for the record format.
!tail -n 10 1800.csv

In [None]:
%%file max_temp.py
from mrjob.job import MRJob

class MRMaxTemperature(MRJob):
    """Find the highest TMAX reading, in Fahrenheit, for each location.

    Input lines are comma-separated records whose first four fields are
    read as: location, date, entry type, reading.
    """

    @staticmethod
    def to_fahrenheit(cels):
        """Convert a reading in tenths of a degree Celsius to Fahrenheit.

        Args:
            cels: The reading, as a number or a numeric string.

        Returns:
            The temperature in degrees Fahrenheit, as a float.

        Raises:
            ValueError: If ``cels`` is a string that is not a valid number.
        """
        celsius = float(cels) / 10.0
        return celsius * 1.8 + 32.0

    def mapper(self, _, line):
        """Yield ``(location, fahrenheit)`` for every TMAX record.

        Blank, truncated or non-numeric records are skipped and tallied in
        the "skipped" counter group, so one bad line does not abort the job.
        """
        fields = line.split(',')
        if len(fields) < 4:
            # Blank or truncated line: nothing usable to extract.
            self.increment_counter('skipped', 'malformed_line', 1)
            return

        # Only the first four fields are needed; any trailing ones are ignored.
        location, _date, entry_type, data = fields[:4]
        if entry_type != 'TMAX':
            return

        try:
            temperature = self.to_fahrenheit(data)
        except ValueError:
            # Empty or non-numeric reading.
            self.increment_counter('skipped', 'bad_temperature', 1)
            return

        yield location, temperature

    def reducer(self, location, temps):
        """Keep only the largest temperature seen for ``location``."""
        yield location, max(temps)


# Launch the job only when max_temp.py is executed directly as a script,
# not when it is imported as a module (as the notebook does below).
if __name__ == '__main__':
    MRMaxTemperature.run()
    

In [None]:
import importlib

import max_temp

# Imports are cached for the lifetime of the kernel; reload so that edits made
# by re-running the `%%file max_temp.py` cell are actually picked up here.
max_temp = importlib.reload(max_temp)

# Run the max-temperature job on the downloaded CSV and print each output pair.
mr_job = max_temp.MRMaxTemperature(args=['1800.csv'])
with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key, value)

# Let's do it on EMR
https://github.com/Yelp/mrjob/blob/master/docs/guides/emr-quickstart.rst