In [None]:
%load_ext dockermagic

# Hadoop - multi-node cluster setup 
![Hadoop](https://hadoop.apache.org/elephant.png)

## Create Hadoop base image

### Create docker container

- Ubuntu 18.04 (https://ubuntu.com/)
- Docker (https://www.docker.com/)
    - container based virtualization

In [None]:
%%bash

# Launch a detached Ubuntu 18.04 container named "hadoopimg" that serves as the
# build environment for the Hadoop base image. Because of --rm the container is
# removed automatically as soon as it is stopped.
docker run --detach --tty --rm --name hadoopimg --hostname hadoopimg ubuntu:18.04

# List running containers to confirm that the build container is up
docker ps

### Install Dependencies

- Java 8 (OpenJDK) - https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions
- Other packages: ssh pdsh wget apt-utils

In [None]:
%%dockerexec hadoopimg

# Run package configuration without prompts: some packages pulled in below
# (e.g. tzdata) otherwise ask debconf questions that nobody can answer here.
export DEBIAN_FRONTEND=noninteractive

# Update package list
# (apt-get is used instead of apt: apt does not offer a stable CLI for scripts)
apt-get -qq update > /install.log 2>&1

# Install Hadoop dependencies
apt-get -qq -f -y install openjdk-8-jdk ssh pdsh >> /install.log 2>&1

# Install other dependencies
apt-get -qq -f -y install vim wget apt-utils python3 python3-pip \
    ipython3 less unzip sudo net-tools >> /install.log 2>&1

### Install Hadoop

- http://hadoop.apache.org/
- Version 3.2.3
- Base directory: /opt
- User/group: hadoop/hadoop
- Package with binaries (version 3.2.3): https://hadoop.apache.org/releases.html

In [None]:
%%dockerexec hadoopimg

# Enable rwx for all on /opt
chmod 777 /opt

# Create user/group hadoop (skipped if the user already exists, so the cell can be re-run)
id -u hadoop > /dev/null 2>&1 || useradd -m -U -s /bin/bash hadoop

# Enable passwordless sudo for hadoop.
# A drop-in file under /etc/sudoers.d is used instead of appending to /etc/sudoers:
# re-running the cell overwrites the file rather than duplicating the rule, and
# the main sudoers file is never modified.
echo "hadoop  ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/hadoop
chmod 0440 /etc/sudoers.d/hadoop

In [None]:
%%bash

# Stop at the first failing command: wget runs quietly, so without this a failed
# download would only surface as a confusing "docker cp" error.
set -e

HADOOPVERSION=hadoop-3.2.3

# Download package into ../pkgs (created if missing).
# archive.apache.org keeps every release, whereas downloads.apache.org only
# hosts the currently supported ones. -c resumes/skips an existing download.
mkdir -p ../pkgs
cd ../pkgs
wget -q -c https://archive.apache.org/dist/hadoop/common/$HADOOPVERSION/$HADOOPVERSION.tar.gz

# Copy installation package to container
docker cp $HADOOPVERSION.tar.gz hadoopimg:/opt

In [None]:
%%dockerexec -u hadoop hadoopimg

HADOOPVERSION=hadoop-3.2.3

# The archive was copied in by "docker cp", so hand it over to the hadoop user,
# unpack it under /opt and delete the archive afterwards
sudo chown hadoop:hadoop /opt/$HADOOPVERSION.tar.gz
tar -zxf /opt/$HADOOPVERSION.tar.gz -C /opt
rm /opt/$HADOOPVERSION.tar.gz

# Create version-independent link /opt/hadoop.
# -f replaces an existing link and -n prevents descending into it, so re-running
# the cell does not fail or create a nested link inside the installation directory.
ln -sfn /opt/$HADOOPVERSION /opt/hadoop

### Configure environment variables

- Create file /opt/envvars.sh with environment variables

In [None]:
%%dockerexec -u hadoop hadoopimg

# Write /opt/envvars.sh, the file sourced by later cells before running Hadoop commands.
# - \${...} is escaped so the reference is written literally and expanded when
#   the file is sourced.
# - \$(dpkg --print-architecture) is NOT escaped: it is expanded now, so JAVA_HOME
#   matches the architecture of the image (amd64, arm64, ...) instead of assuming amd64.
cat > /opt/envvars.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-$(dpkg --print-architecture)
export PDSH_RCMD_TYPE=ssh

export HADOOP_HOME=/opt/hadoop
export HADOOP_COMMON_HOME=\${HADOOP_HOME}
export HADOOP_CONF_DIR=\${HADOOP_HOME}/etc/hadoop
export HADOOP_HDFS_HOME=\${HADOOP_HOME}
export HADOOP_MAPRED_HOME=\${HADOOP_HOME}
export HADOOP_YARN_HOME=\${HADOOP_HOME}

export PATH=\${PATH}:\${HADOOP_HOME}/bin:\${HADOOP_HOME}/sbin

EOF

# Show the generated file
cat /opt/envvars.sh

### Configure passwordless ssh

In [None]:
%%dockerexec -u hadoop hadoopimg

# Append client-wide options to /etc/ssh/ssh_config that turn off host key
# checking and discard known hosts (tee echoes the appended lines)
printf '%s\n' \
    '    StrictHostKeyChecking no' \
    '    UserKnownHostsFile /dev/null' \
    | sudo tee -a /etc/ssh/ssh_config

# Generate an RSA key pair with an empty passphrase for the hadoop user
KEY_FILE="$HOME/.ssh/id_rsa"
ssh-keygen -q -t rsa -P "" -f "$KEY_FILE"

# Authorize the new public key for logins as the hadoop user
cat "$KEY_FILE.pub" >> "$HOME/.ssh/authorized_keys"

### Hadoop configuration files

- Hadoop configuration files location: \$HADOOP\_HOME\/etc\/hadoop
- All cluster nodes contain the same files

#### hadoop-env.sh

- Definition of environment variables used by Hadoop processes

In [None]:
%%dockerexec -u hadoop hadoopimg

# Append JAVA_HOME to hadoop-env.sh.
# The architecture suffix is resolved while the file is written, so the path is
# also valid on non-amd64 images (e.g. arm64) instead of being hard-coded.
cat >> /opt/hadoop/etc/hadoop/hadoop-env.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-$(dpkg --print-architecture)
EOF

#### core-site.xml

- Hadoop main configuration
- Default parameters: http://hadoop.apache.org/docs/r3.2.3/hadoop-project-dist/hadoop-common/core-default.xml

In [None]:
%%dockerexec -u hadoop hadoopimg

# Overwrite core-site.xml with the cluster-wide settings:
# - fs.defaultFS: hdfs://hadoop:9000 ("hadoop" is the hostname given to the master container)
# - hadoop.proxyuser.hadoop.groups / hadoop.proxyuser.hadoop.hosts: "*", i.e. no
#   restriction on groups or hosts for the proxy user "hadoop"
cat > /opt/hadoop/etc/hadoop/core-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>

<property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop:9000</value>
</property>

<property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
</property>

<property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
</property>

</configuration>
EOF

#### hdfs-site.xml

- HDFS configuration
- Default parameters: http://hadoop.apache.org/docs/r3.2.3/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml

In [None]:
%%dockerexec -u hadoop hadoopimg

# Overwrite hdfs-site.xml:
# - NameNode / DataNode storage directories under /opt/hadoop/data
# - replication factor 2, block size 33554432 bytes (32 MiB)
# - dfs.hosts.exclude: file listing DataNodes to exclude (created empty below)
# - dfs.namenode.heartbeat.recheck-interval: 10000 (milliseconds)
cat > /opt/hadoop/etc/hadoop/hdfs-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>

<property>
    <name>dfs.namenode.name.dir</name>
    <value>/opt/hadoop/data/nameNode</value>
</property>

<property>
    <name>dfs.datanode.data.dir</name>
    <value>/opt/hadoop/data/dataNode</value>
</property>

<property>
    <name>dfs.replication</name>
    <value>2</value>
</property>

<property>
    <name>dfs.blocksize</name>
    <value>33554432</value>
</property>

<property>
    <name>dfs.hosts.exclude</name>
    <value>/opt/hadoop/etc/hadoop/dfs.exclude</value>
</property>

<property>
    <name>dfs.namenode.heartbeat.recheck-interval</name>
    <value>10000</value>
</property>

</configuration>

EOF

# Create the (initially empty) exclude file referenced by dfs.hosts.exclude so
# the NameNode does not point at a missing file. touch keeps existing content.
touch /opt/hadoop/etc/hadoop/dfs.exclude

#### yarn-site.xml

- YARN configuration
- Default parameters: http://hadoop.apache.org/docs/r3.2.3/hadoop-yarn/hadoop-yarn-common/yarn-default.xml

In [None]:
%%dockerexec -u hadoop hadoopimg

# Overwrite yarn-site.xml:
# - ResourceManager and timeline service hostname: "hadoop" (the master container)
# - NodeManager auxiliary service: mapreduce_shuffle
# - memory: 1536 MB per NodeManager, allocations between 128 MB and 1536 MB
# - timeline service, system metrics publisher and log aggregation enabled
# - yarn.nm.liveness-monitor.expiry-interval-ms: 10000
cat > /opt/hadoop/etc/hadoop/yarn-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>hadoop</value>
</property>

<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>

<property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>1536</value>
</property>

<property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>1536</value>
</property>

<property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>128</value>
</property>

<property>
    <name>yarn.timeline-service.enabled</name>
    <value>true</value>
</property>

<property>
    <name>yarn.timeline-service.hostname</name>
    <value>hadoop</value>
</property>

<property>
    <name>yarn.system-metrics-publisher.enabled</name>
    <value>true</value>
</property>

<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>

<property>
    <name>yarn.nm.liveness-monitor.expiry-interval-ms</name>
    <value>10000</value>
</property>

</configuration>
EOF

#### mapred-site.xml

- MapReduce configuration
- Default parameters: http://hadoop.apache.org/docs/r3.2.3/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml

In [None]:
%%dockerexec -u hadoop hadoopimg

# Overwrite mapred-site.xml:
# - mapreduce.framework.name: yarn
# - application classpath: MapReduce jars under /opt/hadoop/share/hadoop/mapreduce
#   (absolute paths; the "*" entries are written literally, heredocs do not glob)
# - memory: 512 MB for the application master, 256 MB per map and per reduce task
cat > /opt/hadoop/etc/hadoop/mapred-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>

<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>

<property>
    <name>mapreduce.application.classpath</name>
    <value>/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/mapreduce/lib/*</value>
</property>

<property>
    <name>yarn.app.mapreduce.am.resource.mb</name>
    <value>512</value>
</property>

<property>
    <name>mapreduce.map.memory.mb</name>
    <value>256</value>
</property>

<property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>256</value>
</property>

</configuration>
EOF

#### workers

- List of worker nodes (NodeManager and DataNode)

In [None]:
%%dockerexec -u hadoop hadoopimg

# Overwrite the workers file with one worker hostname per line.
# The names must match the hostnames given to the worker containers.
cat > /opt/hadoop/etc/hadoop/workers << EOF
hadoop1
hadoop2
hadoop3
EOF

### Commit base image

In [None]:
%%bash

# Create hadoopimg image based on the hadoopimg container, then stop the container.
# The container was started with --rm, so stopping it deletes it: only stop it
# once the commit has succeeded, otherwise all installation work would be lost.
docker commit hadoopimg hadoopimg && docker stop hadoopimg

## Create cluster

### Run nodes

In [None]:
%%bash

# ---------------------------------------------------------------------------
# MASTER (container/hostname "hadoop")
#
# Published ports
#   9870  - Namenode
#   9868  - Secondary Namenode
#   8088  - ResourceManager
#   19888 - MapReduce Job History
#   8188  - Timeline Service
#   4040  - Spark Application UI
#   8080  - Jupyter
# ---------------------------------------------------------------------------

# Host directories are mounted relative to the parent of the current directory
cd ..
BASE_DIR="$(pwd)"

docker run -d -t --memory 4g --memory-swap 4g --rm --name hadoop -h hadoop -u hadoop \
    -v "$BASE_DIR"/pkgs:/opt/pkgs \
    -v "$BASE_DIR"/notebooks:/opt/notebooks \
    -v "$BASE_DIR"/datasets:/opt/datasets \
    -p 9870:9870 -p 9868:9868 -p 8088:8088 -p 19888:19888 \
    -p 8188:8188 -p 4040:4040 -p 8080:8080 \
    hadoopimg

# ---------------------------------------------------------------------------
# WORKERS (containers/hostnames "hadoop1" .. "hadoop3")
#
# Container ports
#   9864 - DataNode WebUI     -> host ports 9864, 9865, 9866
#   8042 - NodeManager WebUI  -> host ports 8042, 8043, 8044
# ---------------------------------------------------------------------------
for IDX in 1 2 3; do
    docker run -d -t --memory 2g --memory-swap 2g --rm \
        --name "hadoop$IDX" -h "hadoop$IDX" -u hadoop \
        -p "$((9863 + IDX)):9864" -p "$((8041 + IDX)):8042" \
        hadoopimg
done

docker ps

### Configure hosts file on all nodes

- /etc/hosts

In [None]:
%%bash

NODES="hadoop hadoop1 hadoop2 hadoop3"

# Collect the entries in a temporary file (removed on exit, even on failure)
# rather than in a file named "hosts" in the current directory.
HOSTS_FILE=$(mktemp)
trap 'rm -f "$HOSTS_FILE"' EXIT

# Get IPs and create hosts file.
# The address is read with a docker inspect format template instead of grepping
# the JSON output, so it does not depend on the layout/order of that output.
for NODE in $NODES; do
    IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$NODE")
    if [ -z "$IP" ]; then
        # Container missing or not running: do not write a broken hosts entry
        echo "Could not determine the IP address of container '$NODE'" >&2
        exit 1
    fi
    echo "$IP $NODE" >> "$HOSTS_FILE"
done

cat "$HOSTS_FILE"

# Copy to all nodes (appended to /etc/hosts as root)
for NODE in $NODES; do
    docker exec -i -u root "$NODE" sh -c 'cat >> /etc/hosts' < "$HOSTS_FILE"
done

### Start ssh server on all nodes

In [None]:
%%bash

# (Re)start the ssh service as root on every node and print its status
for NODE in hadoop hadoop{1..3}; do
    echo "$NODE"
    for ACTION in restart status; do
        docker exec -u root "$NODE" service ssh "$ACTION"
    done
done

## Format HDFS on Namenode

In [None]:
%%dockerexec hadoop

# Load the Hadoop environment (JAVA_HOME, HADOOP_* variables, PATH)
source /opt/envvars.sh

# Format the NameNode without asking for confirmation.
# NOTE(review): -force presumably reformats an already existing name directory,
# so re-running this cell would wipe existing HDFS metadata - confirm before re-running.
hdfs namenode -format -force -nonInteractive

## Start Hadoop daemons

- manual execution: ```hdfs --daemon start (namenode|datanode)``` and ```yarn --daemon start (resourcemanager|nodemanager)```
- auxiliary scripts to run all processes on the cluster: start-dfs.sh (HDFS) and start-yarn.sh (YARN)
- some services still need to be executed manually (timelineserver, historyserver)

In [None]:
%%dockerexec hadoop

# Load the Hadoop environment; it also puts the sbin start/stop scripts on PATH
source /opt/envvars.sh

# HDFS daemons (cluster-wide helper script)
start-dfs.sh

# YARN daemons (cluster-wide helper script)
start-yarn.sh

# timelineserver - not covered by the helper scripts, started manually on the master
yarn --daemon start timelineserver

# historyserver - not covered by the helper scripts, started manually on the master
mapred --daemon start historyserver

In [None]:
%%bash

# Print the Java processes (jps) running on each node, preceded by the node name
for NODE in hadoop hadoop{1..3}
do
    echo "$NODE"
    docker exec "$NODE" jps
done

## Create HDFS directories

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Home directory of the hadoop user in HDFS
hdfs dfs -mkdir -p /user/hadoop
hdfs dfs -chown hadoop:hadoop /user/hadoop

# World-writable /tmp.
# -p makes mkdir succeed when /tmp already exists (cell re-run, or the directory
# was already created by a running service); without it the command fails.
hdfs dfs -mkdir -p /tmp
hdfs dfs -chmod 777 /tmp

## Install and run Jupyter on master node

In [None]:
%%dockerexec hadoop

# pip3 -q install notebook
pip3 -q install jupyterlab

# IPv4 address of eth0.
# Only the record whose first field is exactly "inet" is used: a plain
# "grep inet" would also match an "inet6" line and put two addresses in $IP.
IP=$(ifconfig eth0 | awk '$1 == "inet" { print $2 }')

cd /opt

export SHELL=/bin/bash
# nohup /home/hadoop/.local/bin/jupyter-notebook --ip=$IP --port=8080 --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.notebook_dir='/' --no-browser &
# Start JupyterLab in the background on port 8080 (published by the master container).
# Token and password are disabled: only acceptable for a local, throw-away cluster.
nohup /home/hadoop/.local/bin/jupyter-lab --ip=$IP --port=8080 --notebook-dir='/' --ServerApp.token='' --ServerApp.password='' --no-browser &

# Remember the PID of the background process for the shutdown procedure
echo $! > /tmp/jupyter.pid

# To kill
# kill $(cat /tmp/jupyter.pid)

## Access web interfaces

- JupyterLab
    - http://localhost:8080
- Master - hadoop
    - Resource Manager: http://localhost:8088
    - NameNode: http://localhost:9870
    - Secondary NameNode: http://localhost:9868
    - MapReduce Job History: http://localhost:19888
    - Timeline Service: http://localhost:8188
- Workers
    - hadoop1
        - NodeManager: http://localhost:8042
        - DataNode: http://localhost:9864
    - hadoop2
        - NodeManager: http://localhost:8043
        - DataNode: http://localhost:9865
    - hadoop3
        - NodeManager: http://localhost:8044
        - DataNode: http://localhost:9866

## Run MapReduce Pi example

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Arguments of the "pi" example program
NUM_MAPS=6
SAMPLES_PER_MAP=10000

# Run the example from the directory that contains the MapReduce jars
cd "${HADOOP_HOME}/share/hadoop/mapreduce"
hadoop jar ./hadoop-mapreduce-examples-3.2.3.jar pi "$NUM_MAPS" "$SAMPLES_PER_MAP"

# SHUTDOWN PROCEDURE

## Stop Jupyter

In [None]:
%%dockerexec hadoop

# Stop the Jupyter server whose PID was stored when it was started.
# Guarded so that a missing PID file (Jupyter never started or already stopped)
# gives a clear message instead of calling kill without arguments; the PID file
# is removed after a successful kill so a stale PID is never reused.
PID_FILE=/tmp/jupyter.pid

if [ -f "$PID_FILE" ]; then
    kill "$(cat "$PID_FILE")" && rm -f "$PID_FILE"
else
    echo "No $PID_FILE found - Jupyter does not seem to be running"
fi

## Stop Hadoop daemons

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Stop the daemons in the reverse order of their start-up: the history/timeline
# servers and YARN are stopped while HDFS is still available, and HDFS goes last.
mapred --daemon stop historyserver
yarn --daemon stop timelineserver
stop-yarn.sh
stop-dfs.sh

## Stop Docker containers

In [None]:
%%bash

# Stop the master and the three workers one after another
# (the containers were started with --rm, so stopping also removes them)
for NODE in hadoop hadoop{1..3}
do
    docker stop "$NODE"
done

# Show what is still running
docker ps