MRJob is meant to be used within standalone Python scripts and called from the command line. To use it inside the notebook, we will use the `%%file` magic to write code to a file and then `!` to execute a shell command inside the notebook.

In [6]:
# zip() returns a lazy iterator (a zip object), not a list: it can be looped
# over like a list, but each pair is produced on demand as the loop consumes
# it instead of all pairs being stored in memory up front.
for pair in zip([1, 2], [7, 8]):
    print(pair)

(1, 7)
(2, 8)


In [9]:
%%file word_frequency_count.py 
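# The %%file cell magic on the first line of this cell writes everything
# below it to word_frequency_count.py instead of executing it in the notebook.
# After running the cell, look for the new file in the working directory.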

from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):
    """Job that totals the characters, words and lines of the input text.

    The mapper emits three partial counts for every input line under the
    fixed keys "chars", "words" and "lines"; the reducer adds up the partial
    counts that share a key.
    """

    def mapper(self, _, line):
        """Yield the per-line contribution to each of the three totals.

        The first argument is named ``_`` because it is never used; only
        ``line`` (one line of the input text) is examined.
        """
        # split() with no arguments breaks the line on runs of whitespace.
        tokens = line.split()
        per_line_counts = (
            ("chars", len(line)),
            ("words", len(tokens)),
            ("lines", 1),
        )
        # Yielding (rather than returning a list) hands the pairs out one at
        # a time, so nothing has to be accumulated in memory.
        yield from per_line_counts

    def reducer(self, key, values):
        """Yield ``key`` paired with the sum of every count in ``values``.

        ``values`` is an iterable of the numbers the mapper emitted for
        ``key`` (character counts, word counts, or 1s for lines).
        """
        total = 0
        for partial in values:
            total += partial
        yield key, total


# Start the job only when this file is executed as a script (for example
# `python3 word_frequency_count.py <input file>`), not when it is imported.
# run() is not defined on MRWordFrequencyCount itself; it is inherited from MRJob.
if __name__ == '__main__':
    MRWordFrequencyCount.run()

Overwriting word_frequency_count.py


In [13]:
! python3 word_frequency_count.py /data/AnnaKarenina.txt
# The leading exclamation point makes the notebook run the rest of that line
# as a shell command, exactly as if it had been typed in a terminal. Here it
# executes the script written by the previous cell on /data/AnnaKarenina.txt.

No configs found; falling back on auto-configuration
Creating temp directory /tmp/word_frequency_count.wileong.20160428.212721.603607
Running step 1 of 1...
Streaming final output from /tmp/word_frequency_count.wileong.20160428.212721.603607/output...
"chars"	1944838
"lines"	40291
"words"	353076
Removing temp directory /tmp/word_frequency_count.wileong.20160428.212721.603607...


In [17]:
%%file character_count.py 
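# The %%file cell magic on the first line of this cell writes everything
# below it to the file name given after the magic (character_count.py)
# instead of executing it in the notebook.
# After running the cell, look for the new file in the working directory.

# Reminder: `pass` is Python's do-nothing statement (useful as a placeholder body).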

from mrjob.job import MRJob

# Count the number of occurrences of each distinct alphanumeric character in
# the text, treating upper- and lowercase letters as the same character.
class CharacterCount(MRJob):
    """Job that counts how often each alphanumeric character occurs.

    Characters are lowercased before counting, so "A" and "a" share one
    total. Anything that is not alphanumeric (spaces, punctuation, ...) is
    ignored.
    """

    def mapper(self, _, line):
        """Yield ``(character, 1)`` for every alphanumeric character in ``line``.

        The first argument is named ``_`` because it is never used. The
        lowercased character becomes the key under which the 1s are grouped.
        """
        # A generator expression keeps this lazy: pairs are produced one at a
        # time instead of being collected in a list first.
        yield from ((symbol.lower(), 1) for symbol in line if symbol.isalnum())

    def reducer(self, key, values):
        """Yield ``key`` with its total number of occurrences.

        Every value is a 1 emitted by the mapper, so summing them gives the
        number of times the character ``key`` was seen.
        """
        occurrences = sum(values)
        yield key, occurrences


# Start the job only when this file is executed as a script (for example
# `python3 character_count.py <input file>`), not when it is imported.
# run() is not defined on CharacterCount itself; it is inherited from MRJob.
if __name__ == '__main__':
    CharacterCount.run()

Overwriting character_count.py


In [18]:
# Shell escape: run the character-count script written by the previous cell
# on /data/AnnaKarenina.txt.
! python3 character_count.py /data/AnnaKarenina.txt

No configs found; falling back on auto-configuration
Creating temp directory /tmp/character_count.wileong.20160428.212937.726876
Running step 1 of 1...
Streaming final output from /tmp/character_count.wileong.20160428.212937.726876/output...
"0"	51
"1"	189
"2"	116
"3"	71
"4"	36
"5"	40
"6"	33
"7"	31
"8"	39
"9"	38
"\u00e0"	24
"\u00e2"	4
"\u00e4"	1
"\u00e7"	4
"\u00e8"	22
"\u00e9"	36
"\u00ea"	7
"\u00ef"	13
"\u00f4"	5
"\u00fc"	4
"a"	125132
"b"	21525
"c"	34744
"d"	69021
"e"	187106
"f"	31489
"g"	33663
"h"	106968
"i"	110257
"j"	1509
"k"	15548
"l"	61103
"m"	34483
"n"	111349
"o"	115233
"p"	24178
"q"	1421
"r"	80827
"s"	98658
"t"	141980
"u"	40143
"v"	20072
"w"	37313
"x"	3426
"y"	32376
"z"	1044
Removing temp directory /tmp/character_count.wileong.20160428.212937.726876...


In [24]:
%%file max_length.py 
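# The %%file cell magic on the first line of this cell writes everything
# below it to the file name given after the magic (max_length.py)
# instead of executing it in the notebook.
# After running the cell, look for the new file in the working directory.

# Reminder: `pass` is Python's do-nothing statement (useful as a placeholder body).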

from mrjob.job import MRJob

# get the length of the longest word in a document
class MaxLength(MRJob):
    """Job that finds the length of the longest word in the input text.

    Each line contributes the length of its own longest word; the reducer
    then takes the maximum of those per-line maxima. All pairs are emitted
    under the single key ``None`` because no grouping is needed.
    """

    def mapper(self, _, line):
        """Yield ``(None, n)`` where ``n`` is the longest word length in ``line``.

        The first argument is named ``_`` because it is never used. Lines
        that contain no words yield nothing.

        An alternative would be to yield the length of every single word;
        emitting only the per-line maximum produces one pair per line rather
        than one per word.
        """
        # split() with no arguments breaks the line on runs of whitespace.
        tokens = line.split()
        if not tokens:
            # Nothing to measure on a blank line (max() of nothing would fail).
            return
        yield None, max(map(len, tokens))

    def reducer(self, key, values):
        """Yield ``(None, m)`` where ``m`` is the largest value received.

        ``key`` is always ``None`` here, since that is the only key the
        mapper emits.
        """
        longest = max(values)
        yield None, longest


# Start the job only when this file is executed as a script (for example
# `python3 max_length.py <input file>`), not when it is imported.
# run() is not defined on MaxLength itself; it is inherited from MRJob.
if __name__ == '__main__':
    MaxLength.run()

Overwriting max_length.py


In [25]:
# Shell escape: run the max-length script written by the previous cell
# on /data/AnnaKarenina.txt.
! python3 max_length.py /data/AnnaKarenina.txt

No configs found; falling back on auto-configuration
Creating temp directory /tmp/max_length.wileong.20160428.214607.904895
Running step 1 of 1...
Streaming final output from /tmp/max_length.wileong.20160428.214607.904895/output...
null	43
Removing temp directory /tmp/max_length.wileong.20160428.214607.904895...
