In [1]:
# Load environment variables
# Enable the dotenv IPython extension, which provides the %dotenv magic used below
%load_ext dotenv
# Read the variable definitions from /opt/envvars.sh; -o lets them override
# variables that are already set in the kernel's environment
%dotenv -o /opt/envvars.sh
# Display the resulting environment.
# NOTE(review): this prints every variable, so clear the cell output before
# sharing the notebook if the environment ever contains credentials.
%env

{'HOSTNAME': 'hadoop',
 'OLDPWD': '/',
 'PWD': '/opt',
 'HOME': '/home/hadoop',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/flume/bin:/opt/sqoop/bin',
 '_': '/usr/bin/nohup',
 'LANGUAGE': 'en.UTF-8',
 'LANG': 'en.UTF-8',
 'JPY_PARENT_PID': '1566',
 'TERM': 'xterm-color',
 'CLICOLOR': '1',
 'PAGER': 'cat',
 'GIT_PAGER': 'cat',
 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline',
 'JAVA_HOME': '/usr/lib/jvm/java-1.8.0-openjdk-amd64',
 'PDSH_RCMD_TYPE': 'ssh',
 'HADOOP_HOME': '/opt/hadoop',
 'HADOOP_COMMON_HOME': '/opt/hadoop',
 'HADOOP_CONF_DIR': '/opt/hadoop/etc/hadoop',
 'HADOOP_HDFS_HOME': '/opt/hadoop',
 'HADOOP_MAPRED_HOME': '/opt/hadoop',
 'HADOOP_YARN_HOME': '/opt/hadoop',
 'FLUME_HOME': '/opt/flume',
 'SQOOP_HOME': '/opt/sqoop'}

# MRJob

- https://github.com/Yelp/mrjob
- https://mrjob.readthedocs.io/en/stable/

## Local execution

In [2]:
%%bash

# local install
# The version is pinned (0.7.4 is the release the recorded output below was
# produced with) so that re-running the notebook rebuilds the same environment.
pip3 install mrjob==0.7.4

Collecting mrjob
  Downloading https://files.pythonhosted.org/packages/8e/58/fc28ab743aba16e90736ad4e29694bd2adaf7b879376ff149306d50c4e90/mrjob-0.7.4-py2.py3-none-any.whl (439kB)
Collecting PyYAML>=3.10 (from mrjob)
  Downloading https://files.pythonhosted.org/packages/7a/5b/bc0b5ab38247bba158504a410112b6c03f153c652734ece1849749e5f518/PyYAML-5.4.1-cp36-cp36m-manylinux1_x86_64.whl (640kB)
Installing collected packages: PyYAML, mrjob
Successfully installed PyYAML-5.4.1 mrjob-0.7.4


In [3]:
%mkdir /opt/src/mrjob
%cd /opt/src/mrjob

/opt/src/mrjob


In [4]:
%%writefile mrwordcount.py
import re
from mrjob.job import MRJob

WORD_RE = re.compile(r"[\w']+")

class MRWordCount(MRJob):
    """Count how many times each word (compared case-insensitively) occurs."""

    def mapper(self, _, line):
        """Yield ``(word, 1)`` for every ``WORD_RE`` match in ``line``, lower-cased.

        The input key is ignored.
        """
        for token in WORD_RE.findall(line):
            yield token.lower(), 1

    def combiner(self, word, counts):
        """Yield ``word`` with the sum of the counts received for it."""
        partial_total = sum(counts)
        yield word, partial_total

    def reducer(self, word, counts):
        """Yield ``word`` with the sum of all counts received for it."""
        total = sum(counts)
        yield word, total

if __name__ == '__main__':
    MRWordCount.run()

Writing mrwordcount.py


In [5]:
%%bash

# Fetch "The History of Don Quixote" by Miguel de Cervantes from Project Gutenberg
# and store it as donquixote.txt inside /opt/datasets
# (--continue resumes a partially downloaded file)
cd /opt/datasets
wget --quiet --continue --output-document=donquixote.txt http://www.gutenberg.org/files/996/996-0.txt

In [7]:
%%bash

# Without -r/--runner the default "inline" runner would be used:
#   python3 mrwordcount.py /opt/datasets/donquixote.txt > /opt/datasets/donquixote-output.txt

# Run the job with the "local" runner and save its output
python3 mrwordcount.py --runner local /opt/datasets/donquixote.txt > /opt/datasets/donquixote-output.txt

# Show the first 10 lines of the result
head -n 10 /opt/datasets/donquixote-output.txt

"lackeys"	1
"lacking"	1
"lacquered"	1
"lacquey"	26
"lacqueys"	1
"lad"	16
"ladder"	1
"ladders"	1
"laden"	4
"ladies"	84


No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /tmp/mrwordcount.hadoop.20210129.130828.810038
Running step 1 of 1...
job output is in /tmp/mrwordcount.hadoop.20210129.130828.810038/output
Streaming final output from /tmp/mrwordcount.hadoop.20210129.130828.810038/output...
Removing temp directory /tmp/mrwordcount.hadoop.20210129.130828.810038...


## Cluster execution

### Setup

In [8]:
%%bash

# install in all hadoop nodes
# Pinned to the same release on every node (0.7.4 is the version shown in the
# recorded output) so that all nodes run identical mrjob code.
pdsh -w hadoop1,hadoop2,hadoop3 pip3 install mrjob==0.7.4

hadoop3: Collecting mrjob
hadoop2: Collecting mrjob
hadoop1: Collecting mrjob
hadoop3:   Downloading https://files.pythonhosted.org/packages/8e/58/fc28ab743aba16e90736ad4e29694bd2adaf7b879376ff149306d50c4e90/mrjob-0.7.4-py2.py3-none-any.whl (439kB)
hadoop2:   Downloading https://files.pythonhosted.org/packages/8e/58/fc28ab743aba16e90736ad4e29694bd2adaf7b879376ff149306d50c4e90/mrjob-0.7.4-py2.py3-none-any.whl (439kB)
hadoop1:   Downloading https://files.pythonhosted.org/packages/8e/58/fc28ab743aba16e90736ad4e29694bd2adaf7b879376ff149306d50c4e90/mrjob-0.7.4-py2.py3-none-any.whl (439kB)
hadoop2: Collecting PyYAML>=3.10 (from mrjob)
hadoop1: Collecting PyYAML>=3.10 (from mrjob)
hadoop3: Collecting PyYAML>=3.10 (from mrjob)
hadoop3:   Downloading https://files.pythonhosted.org/packages/7a/5b/bc0b5ab38247bba158504a410112b6c03f153c652734ece1849749e5f518/PyYAML-5.4.1-cp36-cp36m-manylinux1_x86_64.whl (640kB)
hadoop1:   Downloading https://files.pythonhosted.org/packages/7a/5b/bc0b5ab38247bba158



In [9]:
%%bash

# create directory in HDFS and send file
# (-p: no error if the directory already exists; -f: overwrite an existing copy,
#  so the cell can be executed more than once)
hdfs dfs -mkdir -p donquixote
hdfs dfs -put -f /opt/datasets/donquixote.txt donquixote

# The job fails when its output directory already exists, so remove the result
# of a previous run first (-f: no error if there is nothing to remove)
hdfs dfs -rm -r -f donquixote-output

# run in hadoop
python3 mrwordcount.py -r hadoop --output-dir donquixote-output hdfs:///user/hadoop/donquixote

2021-01-29 13:11:10,925 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
No configs found; falling back on auto-configuration
No configs specified for hadoop runner
Looking for hadoop binary in /opt/hadoop/bin...
Found hadoop binary: /opt/hadoop/bin/hadoop
Using Hadoop version 3.2.1
Looking for Hadoop streaming jar in /opt/hadoop...
Found Hadoop streaming jar: /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.2.1.jar
Creating temp directory /tmp/mrwordcount.hadoop.20210129.131112.308929
uploading working dir files to hdfs:///user/hadoop/tmp/mrjob/mrwordcount.hadoop.20210129.131112.308929/files/wd...
Copying other local files to hdfs:///user/hadoop/tmp/mrjob/mrwordcount.hadoop.20210129.131112.308929/files/
Running step 1 of 1...
  packageJobJar: [/tmp/hadoop-unjar3235736449570638627/] [] /tmp/streamjob8909491657222742448.jar tmpDir=null
  Connecting to ResourceManager at hadoop/172.17.0.2:8032
  Connecting to Applicat

In [10]:
%%bash

# Merge the job's output files from the HDFS directory donquixote-output into a
# single local file in the current working directory
hdfs dfs -getmerge donquixote-output donquixote-output.txt

# Show the first 10 lines of the merged result
head -n 10 donquixote-output.txt

"0"	2
"000"	1
"1"	45
"104k"	1
"105k"	1
"106k"	1
"1085"	1
"108k"	1
"109k"	2
"10k"	1


2021-01-29 13:14:14,597 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


## MapReduce patterns

### Datasets

- weblog.csv
- books from Project Gutenberg
- departments.csv and employees.csv

In [16]:
%cd /opt/notebooks

/opt/notebooks


In [17]:
%%bash

tar -zxf mrjobdataset.tgz -C /opt/datasets

In [18]:
%cd /opt/src/mrjob

/opt/src/mrjob


### 1. Count

In [19]:
%%writefile 1_count_weblog.py
#Total number of times each page is visited
from mrjob.job import MRJob

class MRURLCount(MRJob):
    """Count the number of requests recorded for each URL of the web log."""

    def mapper(self, _, line):
        """Parse one comma-separated log row and yield ``(url, 1)``.

        The header row (first column equal to ``'IP'``) yields nothing.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns 1-4 are time, request, status and visit. All of them are
        # indexed, so a row with fewer than five columns raises IndexError.
        _time, request, _status, _visit = fields[1], fields[2], fields[3], fields[4]

        # The URL is the second space-separated token of the request
        yield request.split(' ')[1], 1

    def reducer(self, key, list_of_values):
        """Yield the URL together with the sum of its counts."""
        total = sum(list_of_values)
        yield key, total

if __name__ == '__main__':
    MRURLCount.run()

Writing 1_count_weblog.py


In [20]:
%%bash

python3 1_count_weblog.py /opt/datasets/weblog.csv 2> /dev/null | head

"/img/ruet.png"	213
"/index.php"	6
"/js/chart.min.js"	58
"/js/jquery.min.js"	56
"/js/vendor/jquery-1.12.0.min.js"	387
"/js/vendor/modernizr-2.8.3.min.js"	1417
"/js/vendor/moment.min.js"	173
"/login.php"	3298
"/login.php?value=fail"	128
"/logout.php"	44


### 2. Max value

In [21]:
%%writefile 2_max_weblog.py
# Return most visited URL
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRURLMax(MRJob):
    """Two-step job that reports the URL with the highest request count."""

    def steps(self):
        """Step 1 counts the requests per URL, step 2 keeps the largest count."""
        counting = MRStep(mapper=self.mapper1, reducer=self.reducer1)
        selecting = MRStep(reducer=self.reducer2)
        return [counting, selecting]

    def mapper1(self, _, line):
        """Parse one comma-separated log row and yield ``(url, 1)``.

        The header row (first column equal to ``'IP'``) yields nothing.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns 1-4 are time, request, status and visit. All of them are
        # indexed, so a row with fewer than five columns raises IndexError.
        _time, request, _status, _visit = fields[1], fields[2], fields[3], fields[4]

        # The URL is the second space-separated token of the request
        yield request.split(' ')[1], 1

    def reducer1(self, key, list_of_values):
        """Yield ``(None, (count, url))``.

        Every URL is emitted under the same key ``None`` so that all
        ``(count, url)`` pairs arrive together in the second step.
        """
        total = sum(list_of_values)
        yield None, (total, key)

    def reducer2(self, key, list_of_values):
        """Yield the largest ``(count, url)`` pair.

        The pair is yielded as it is, so the count becomes the output key and
        the URL the output value.
        """
        yield max(list_of_values)

if __name__ == '__main__' :
    MRURLMax.run()

Writing 2_max_weblog.py


In [22]:
%%bash

python3 2_max_weblog.py /opt/datasets/weblog.csv 2> /dev/null

3298	"/login.php"


### 3. Average

In [23]:
%%writefile 3_average_weblog.py
#Average visit time
from mrjob.job import MRJob

class MRAvgVisitTime(MRJob):
    """Compute the average visit time per URL of the web log."""

    def mapper(self, _, line):
        """Parse one comma-separated log row and yield ``(url, visit_time)``.

        The header row (first column equal to ``'IP'``) yields nothing; the
        visit time is converted to ``float``.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns 1-3 are time, request and status; column 4 is the visit time
        _time, request, _status = fields[1], fields[2], fields[3]
        visit = float(fields[4])

        # The URL is the second space-separated token of the request
        url = request.split(' ')[1]
        yield url, visit

    def reducer(self, key, list_of_values):
        """Yield the URL with its mean visit time as a string with two decimals."""
        total = 0.0
        count = 0
        for visit_time in list_of_values:
            total += visit_time
            count += 1

        yield key, "%.2f" % (total / count)

if __name__ == '__main__':
    MRAvgVisitTime.run()

Writing 3_average_weblog.py


In [24]:
%%bash

python3 3_average_weblog.py /opt/datasets/weblog.csv 2> /dev/null | head

"/img/ruet.png"	"258.97"
"/index.php"	"227.32"
"/js/chart.min.js"	"299.16"
"/js/jquery.min.js"	"207.62"
"/js/vendor/jquery-1.12.0.min.js"	"252.07"
"/js/vendor/modernizr-2.8.3.min.js"	"244.30"
"/js/vendor/moment.min.js"	"279.61"
"/login.php"	"252.70"
"/login.php?value=fail"	"227.98"
"/logout.php"	"237.21"


### 4. Top N

In [25]:
%%writefile 4_topn_weblog.py
#Top 3 visited pages
from mrjob.job import MRJob
from mrjob.step import MRStep

class MRTopN(MRJob):
    """Two-step job that reports the three URLs with the most requests."""

    def steps(self):
        """Step 1 counts the requests per URL, step 2 keeps the top entries."""
        counting = MRStep(mapper=self.mapper, reducer=self.reducer1)
        ranking = MRStep(reducer=self.reducer2)
        return [counting, ranking]

    def mapper(self, _, line):
        """Parse one comma-separated log row and yield ``(url, 1)``.

        The header row (first column equal to ``'IP'``) yields nothing.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns 1-4 are time, request, status and visit. All of them are
        # indexed, so a row with fewer than five columns raises IndexError.
        _time, request, _status, _visit = fields[1], fields[2], fields[3], fields[4]

        # The URL is the second space-separated token of the request
        yield request.split(' ')[1], 1

    def reducer1(self, key, list_of_values):
        """Yield ``(None, (count, url))``.

        Every URL is emitted under the same key ``None`` so that all
        ``(count, url)`` pairs arrive together in the second step.
        """
        yield None, (sum(list_of_values), key)

    def reducer2(self, _, list_of_values):
        """Return the three largest ``(count, url)`` pairs, largest first.

        Each returned pair is used as one output record: the count is the key
        and the URL the value.
        """
        N = 3
        ranked = sorted(list_of_values, reverse=True)
        return ranked[:N]

if __name__ == '__main__':
    MRTopN.run()

Writing 4_topn_weblog.py


In [26]:
%%bash

python3 4_topn_weblog.py /opt/datasets/weblog.csv 2> /dev/null

3298	"/login.php"
2653	"/home.php"
1417	"/js/vendor/modernizr-2.8.3.min.js"


### 5. Filter

In [27]:
%%writefile 5_filter_weblog.py
#Filter accesses to "/login.php?value=fail" on Feb/2018
from mrjob.job import MRJob

class MRFilter(MRJob):
    """Keep only the requests to "/login.php?value=fail" made in Feb/2018."""

    def mapper(self, _, line):
        """Yield ``(url, (time, ip, visit))`` for rows that pass the filter.

        The header row (first column equal to ``'IP'``) and every row that
        does not match the URL and the month yield nothing.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns: ip, time, request, status, visit
        ip, time, request = fields[0], fields[1], fields[2]
        _status, visit = fields[3], fields[4]

        # The URL is the second space-separated token of the request
        url = request.split(' ')[1]

        # Characters 4-11 of a timestamp such as "[17/Feb/2018:20:08:56"
        # are the month and year, e.g. "Feb/2018"
        month_year = time[4:12]

        is_failed_login = url == "/login.php?value=fail"
        if is_failed_login and month_year == "Feb/2018":
            yield url, (time, ip, visit)

if __name__ == '__main__':
    MRFilter.run()

Writing 5_filter_weblog.py


In [28]:
%%bash

python3 5_filter_weblog.py /opt/datasets/weblog.csv 2> /dev/null

"/login.php?value=fail"	["[17/Feb/2018:20:08:56", "10.128.2.1", "185.3492762625749"]
"/login.php?value=fail"	["[17/Feb/2018:20:20:28", "10.130.2.1", "231.88065750818035"]
"/login.php?value=fail"	["[18/Feb/2018:19:40:31", "10.128.2.1", "119.2142303257265"]
"/login.php?value=fail"	["[18/Feb/2018:19:40:35", "10.128.2.1", "147.46020967961022"]
"/login.php?value=fail"	["[18/Feb/2018:19:40:38", "10.131.0.1", "257.82366658285235"]
"/login.php?value=fail"	["[18/Feb/2018:19:40:39", "10.131.0.1", "169.11496206962957"]
"/login.php?value=fail"	["[20/Feb/2018:11:14:24", "10.130.2.1", "330.90262403457416"]
"/login.php?value=fail"	["[22/Feb/2018:06:26:02", "10.131.0.1", "766.0191400216718"]
"/login.php?value=fail"	["[22/Feb/2018:06:26:08", "10.128.2.1", "34.905177651134075"]
"/login.php?value=fail"	["[22/Feb/2018:06:26:10", "10.128.2.1", "49.17624847339196"]
"/login.php?value=fail"	["[22/Feb/2018:06:26:11", "10.128.2.1", "268.1915691580601"]
"/login.php?value=fail"	["[22/Feb/2018:06:26:33", "10.130.2

### 6. Distinct

In [29]:
%%writefile 6_distinct_weblog.py
#Distinct IPs
from mrjob.job import MRJob

class MRDistinct(MRJob):
    """List every distinct client IP address found in the web log."""

    def mapper(self, _, line):
        """Yield ``(ip, None)`` for each data row; the header row yields nothing."""
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        ip = fields[0]
        if ip == 'IP':
            return

        # Columns 1-4 (time, request, status, visit) are not needed, but they are
        # indexed, so a row with fewer than five columns raises IndexError.
        _time, _request, _status, _visit = fields[1], fields[2], fields[3], fields[4]

        yield ip, None

    def reducer(self, key, list_of_values):
        """Yield each IP once; the values received for it are ignored."""
        yield key, None

if __name__ == '__main__':
    MRDistinct.run()

Writing 6_distinct_weblog.py


In [30]:
%%bash

python3 6_distinct_weblog.py /opt/datasets/weblog.csv 2> /dev/null

"10.131.2.1"	null
"10.128.2.1"	null
"10.131.0.1"	null
"10.129.2.1"	null
"10.130.2.1"	null


### 7. Binning

In [31]:
%%writefile 7_binning_weblog.py
#Create bins for different status codes for 20/Feb/2018
from mrjob.job import MRJob

class MRBinning(MRJob):
    """Group the requests made on 20/Feb/2018 into one bin per status code."""

    def mapper(self, _, line):
        """Yield ``(status, (time, request, ip))`` for rows dated 20/Feb/2018.

        The header row (first column equal to ``'IP'``) and rows of any other
        day yield nothing.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns: ip, time, request, status, visit
        ip, time, request = fields[0], fields[1], fields[2]
        status, _visit = fields[3], fields[4]

        # Characters 1-11 of a timestamp such as "[20/Feb/2018:01:51:41"
        # are the day, month and year, e.g. "20/Feb/2018"
        day = time[1:12]

        if day == "20/Feb/2018":
            yield status, (time, request, ip)

    def reducer(self, key, list_of_values):
        """Yield the status code with the list of all records collected for it."""
        records = list(list_of_values)
        yield key, records

if __name__ == '__main__':
    MRBinning.run()

Writing 7_binning_weblog.py


In [32]:
%%bash

python3 7_binning_weblog.py /opt/datasets/weblog.csv 2> /dev/null

"200"	[["[20/Feb/2018:01:51:41", "GET /login.php HTTP/1.1", "10.130.2.1"], ["[20/Feb/2018:07:06:12", "GET /login.php HTTP/1.1", "10.130.2.1"], ["[20/Feb/2018:09:23:17", "GET /login.php HTTP/1.1", "10.128.2.1"], ["[20/Feb/2018:09:23:21", "GET /fonts/fontawesome-webfont.woff2?v=4.6.3 HTTP/1.1", "10.128.2.1"], ["[20/Feb/2018:09:23:29", "GET /home.php HTTP/1.1", "10.128.2.1"], ["[20/Feb/2018:09:23:30", "GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1", "10.131.0.1"], ["[20/Feb/2018:09:24:12", "GET /compiler.php HTTP/1.1", "10.131.0.1"], ["[20/Feb/2018:11:04:47", "GET /login.php HTTP/1.1", "10.130.2.1"], ["[20/Feb/2018:11:04:51", "GET /css/bootstrap.min.css HTTP/1.1", "10.128.2.1"], ["[20/Feb/2018:11:04:52", "GET /css/normalize.css HTTP/1.1", "10.130.2.1"], ["[20/Feb/2018:11:04:55", "GET /css/font-awesome.min.css HTTP/1.1", "10.130.2.1"], ["[20/Feb/2018:11:04:57", "GET /css/main.css HTTP/1.1", "10.130.2.1"], ["[20/Feb/2018:11:04:58", "GET /css/style.css HTTP/1.1", "10.130.2.1"], ["[20/Feb/201

### 8. Inverted index

In [33]:
%%writefile 8_invertedindex_books.py
#Inverted Index
from mrjob.job import MRJob
import os

class MRInvertedIndex(MRJob):
    """Build an inverted index: each word with the input files that contain it."""

    def mapper(self, _, line):
        """Yield ``(word, input_file)`` for every whitespace-separated token.

        The name of the file being processed is read from the
        ``mapreduce_map_input_file`` environment variable; a ``KeyError`` is
        raised when that variable is not set.
        """
        fileName = os.environ['mapreduce_map_input_file']

        for word in line.split():
            yield word, fileName

    def reducer(self, key, list_of_values):
        """Yield the word with the distinct files it occurs in, sorted by name.

        The files are sorted so that the output is reproducible: iterating a
        set of strings directly gives an order that can change from one
        interpreter run to the next because string hashes are randomised.
        """
        yield key, sorted(set(list_of_values))

if __name__ == '__main__':
    MRInvertedIndex.run()

Writing 8_invertedindex_books.py


In [35]:
%%bash

rm -rf /opt/datasets/books/.ipynb_checkpoints

In [36]:
%%bash

python3 8_invertedindex_books.py /opt/datasets/books 2> /dev/null | head -n 40

"life,"	["file:///opt/datasets/books/book1.txt", "file:///opt/datasets/books/book2.txt", "file:///opt/datasets/books/book3.txt"]
"life,\u2014all"	["file:///opt/datasets/books/book2.txt"]
"life,\u2014in"	["file:///opt/datasets/books/book2.txt"]
"life--and"	["file:///opt/datasets/books/book3.txt"]
"life--animal"	["file:///opt/datasets/books/book3.txt"]
"life--was"	["file:///opt/datasets/books/book3.txt"]
"life-blood"	["file:///opt/datasets/books/book3.txt"]
"life-boats."	["file:///opt/datasets/books/book2.txt"]
"life-buoy"	["file:///opt/datasets/books/book2.txt"]
"life-buoy-coffin"	["file:///opt/datasets/books/book2.txt"]
"life-buoy\u2014a"	["file:///opt/datasets/books/book2.txt"]
"life-buoys"	["file:///opt/datasets/books/book2.txt"]
"life-giving,"	["file:///opt/datasets/books/book3.txt"]
"life-like"	["file:///opt/datasets/books/book2.txt"]
"life-like,"	["file:///opt/datasets/books/book2.txt"]
"life-line,"	["file:///opt/datasets/books/book2.txt"]
"life-lines,"	["file:///opt/datasets/book

### 9. Sort

In [37]:
%%writefile 9_sort_weblog.py
# Sort visit times in descending order
from mrjob.job import MRJob
class MRSortVisit(MRJob):
    """Sort all requests of the web log by visit time, longest first."""

    def mapper(self, _, line):
        """Yield ``(None, (visit, (time, url, ip)))`` for each data row.

        The header row (first column equal to ``'IP'``) yields nothing. Every
        record is emitted under the same key ``None`` so that all of them
        reach a single reducer call.
        """
        # Comma-separated columns with surrounding whitespace removed
        fields = [field.strip() for field in line.split(',')]

        # Skip the header row
        if fields[0] == 'IP':
            return

        # Columns: ip, time, request, status, visit
        ip, time, request = fields[0], fields[1], fields[2]
        _status, visit = fields[3], fields[4]

        # The URL is the second space-separated token of the request
        url = request.split(' ')[1]

        yield None, (visit, (time, url, ip))

    def reducer(self, key, list_of_values):
        """Return ``(visit_time, record)`` pairs in descending order.

        The visit time arrives as text and is converted to ``float`` so that
        the ordering is numeric. Each returned pair is used as one output
        record.
        """
        return sorted(
            ((float(visit), record) for visit, record in list_of_values),
            reverse=True,
        )

if __name__ == '__main__':
    MRSortVisit.run()

Writing 9_sort_weblog.py


In [38]:
%%bash

python3 9_sort_weblog.py /opt/datasets/weblog.csv 2> /dev/null | head -n 20

8742.057189992775	["[29/Jan/2018:20:33:45", "/login.php", "10.128.2.1"]
6287.629575490433	["[29/Jan/2018:20:55:42", "/login.php", "10.128.2.1"]
5711.326270236046	["[29/Jan/2018:20:35:56", "/js/vendor/modernizr-2.8.3.min.js", "10.128.2.1"]
5283.59276598012	["[09/Nov/2017:19:53:43", "/js/vendor/modernizr-2.8.3.min.js", "10.131.0.1"]
5123.493680523937	["[29/Jan/2018:20:47:45", "/js/vendor/modernizr-2.8.3.min.js", "10.128.2.1"]
5053.873518356491	["[13/Nov/2017:09:11:14", "/login.php", "10.131.2.1"]
4642.003763780086	["[29/Jan/2018:20:29:30", "/js/vendor/modernizr-2.8.3.min.js", "10.131.0.1"]
4615.5689039456365	["[29/Jan/2018:20:32:40", "/login.php", "10.128.2.1"]
4421.817090530212	["[29/Jan/2018:20:35:37", "/js/vendor/modernizr-2.8.3.min.js", "10.131.0.1"]
3857.8085677731	["[24/Nov/2017:08:29:49", "/css/font-awesome.min.css", "10.131.2.1"]
3725.3713482396056	["[29/Jan/2018:20:34:22", "/login.php", "10.131.0.1"]
3667.904220839362	["[12/Nov/2017:19:44:01", "/archive.php", "10.130.2.1"]
3570.

### 10. Joins

#### InnerJoin

In [39]:
%%writefile 10_innerjoin_db.py
from mrjob.job import MRJob
import os

class MRInnerJoin(MRJob):
    """Inner join of employees.csv and departments.csv on the department number."""

    def mapper(self, _, line):
        """Tag each row with its source table and key it by department number.

        The source file is read from the ``mapreduce_map_input_file``
        environment variable. Rows of employees.csv are keyed by their third
        column, rows of departments.csv by their first column; rows of any
        other file yield nothing.
        """
        data = line.split(',')
        filename = os.environ['mapreduce_map_input_file']

        if 'employees.csv' in filename:
            tag, key_column = 'Employee', 2
        elif 'departments.csv' in filename:
            tag, key_column = 'Department', 0
        else:
            return

        yield data[key_column], (tag, data)

    def reducer(self, key, list_of_values):
        """Yield one joined record per (employee, department) pair of the key.

        A key that lacks employees or lacks departments produces no output.
        """
        records = list(list_of_values)
        employees = [record for record in records if record[0] == 'Employee']
        departments = [record for record in records if record[0] == 'Department']

        # Inner Join: every employee combined with every department of the key
        for employee in employees:
            for department in departments:
                yield key, employee + department

if __name__ == '__main__' :
    MRInnerJoin.run()

Writing 10_innerjoin_db.py


In [40]:
%%bash

python3 10_innerjoin_db.py /opt/datasets/employees.csv /opt/datasets/departments.csv 2> /dev/null | head

"d006"	["Employee", ["10009", "Sumant Peac", "d006", "1985-02-18"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10010", "Duangkaew Piveteau", "d006", "1989-08-24"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10029", "Otmar Herbst", "d006", "1985-11-20"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10033", "Arif Merlo", "d006", "1987-03-18"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10067", "Claudi Stavenow", "d006", "1987-03-04"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10073", "Shir McClurg", "d006", "1991-12-01"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10111", "Hugo Rosis", "d006", "1988-06-19"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10124", "Geraldo Marwedel", "d006", "1991-09-05"], "Department", ["d006", "Quality Management"]]
"d006"	["Employee", ["10138", "Perry Shimshoni", "d006", "1986-09-18"], 

#### LeftOuterJoin

In [None]:
%%writefile 11_leftouterjoin_db.py
from mrjob.job import MRJob
import os

class MRLeftOuterJoin(MRJob):
    """Left outer join of employees.csv (left) with departments.csv (right)."""

    def mapper(self, _, line):
        """Tag each row with its source table and key it by department number.

        The source file is read from the ``mapreduce_map_input_file``
        environment variable. Rows of employees.csv are keyed by their third
        column, rows of departments.csv by their first column; rows of any
        other file yield nothing.
        """
        data = line.split(',')
        filename = os.environ['mapreduce_map_input_file']

        if 'employees.csv' in filename:
            tag, key_column = 'Employee', 2
        elif 'departments.csv' in filename:
            tag, key_column = 'Department', 0
        else:
            return

        yield data[key_column], (tag, data)

    def reducer(self, key, list_of_values):
        """Yield every employee of the key, joined with its departments if any.

        An employee whose key has no department record is yielded on its own;
        departments without employees produce no output.
        """
        employees, departments = [], []
        for record in list(list_of_values):
            tag = record[0]
            if tag == 'Employee':
                employees.append(record)
            elif tag == 'Department':
                departments.append(record)

        # Left Outer Join
        for employee in employees:
            if not departments:
                yield key, employee
                continue
            for department in departments:
                yield key, employee + department

if __name__ == '__main__' :
    MRLeftOuterJoin.run()

In [None]:
%%bash

python3 11_leftouterjoin_db.py /opt/datasets/employees.csv /opt/datasets/departments.csv 2> /dev/null | head

#### RightOuterJoin

In [None]:
%%writefile 12_rightouterjoin_db.py
from mrjob.job import MRJob
import os

class MRRightOuterJoin(MRJob):
    """Right outer join of employees.csv (left) with departments.csv (right)."""

    def mapper(self, _, line):
        """Tag each row with its source table and key it by department number.

        The source file is read from the ``mapreduce_map_input_file``
        environment variable. Rows of employees.csv are keyed by their third
        column, rows of departments.csv by their first column; rows of any
        other file yield nothing.
        """
        data = line.split(',')
        filename = os.environ['mapreduce_map_input_file']

        if 'employees.csv' in filename:
            tag, key_column = 'Employee', 2
        elif 'departments.csv' in filename:
            tag, key_column = 'Department', 0
        else:
            return

        yield data[key_column], (tag, data)

    def reducer(self, key, list_of_values):
        """Yield every department of the key, joined with its employees if any.

        A department whose key has no employee record is yielded on its own;
        employees without a department produce no output.
        """
        employees, departments = [], []
        for record in list(list_of_values):
            tag = record[0]
            if tag == 'Employee':
                employees.append(record)
            elif tag == 'Department':
                departments.append(record)

        # Right Outer Join
        for department in departments:
            if not employees:
                yield key, department
                continue
            for employee in employees:
                yield key, employee + department

if __name__ == '__main__' :
    MRRightOuterJoin.run()

In [None]:
%%bash

python3 12_rightouterjoin_db.py /opt/datasets/employees.csv /opt/datasets/departments.csv 2> /dev/null | head

#### FullOuterJoin

In [None]:
%%writefile 13_fullouterjoin_db.py
from mrjob.job import MRJob
import os

class MRFullOuterJoin(MRJob):
    """Full outer join of employees.csv with departments.csv on the department number."""

    def mapper(self, _, line):
        """Tag each row with its source table and key it by department number.

        The source file is read from the ``mapreduce_map_input_file``
        environment variable. Rows of employees.csv are keyed by their third
        column, rows of departments.csv by their first column; rows of any
        other file yield nothing.
        """
        data = line.split(',')

        filename = os.environ['mapreduce_map_input_file']

        if 'employees.csv' in filename:
            dep_no = data[2]
            yield dep_no, ('Employee', data)
        elif 'departments.csv' in filename:
            dep_no = data[0]
            yield dep_no, ('Department', data)

    def reducer(self, key, list_of_values):
        """Yield the joined records of one department number.

        * employees and departments present: one record per pair;
        * employees only: each employee on its own;
        * departments only: each department on its own.

        Every record is emitted under the join key ``key``.
        """
        employees = []
        departments = []
        for v in list_of_values:
            if v[0] == 'Employee':
                employees.append(v)
            elif v[0] == 'Department':
                departments.append(v)

        # Full Outer Join
        if employees:
            for e in employees:
                if departments:
                    for d in departments:
                        yield key, (e + d)
                else:
                    yield key, e
        else:
            # No employee for this key: emit each department by itself.
            # (The loop is required here; a bare reference to ``d`` would be
            # unbound because the pairing loop above never ran.)
            for d in departments:
                yield key, d

if __name__ == '__main__' :
    MRFullOuterJoin.run()

In [None]:
%%bash

python3 13_fullouterjoin_db.py /opt/datasets/employees.csv /opt/datasets/departments.csv 2> /dev/null | head