In [27]:
# Replace (not extend) the kernel's PATH with a fixed list of system
# directories, including /usr/local/bin, so later cells can find the
# docker CLI -- presumably where docker is installed on this host.
%env PATH /usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin

env: PATH=/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin


In [194]:
# dockerexec magic
from IPython.core.magic import register_cell_magic
import tempfile
import os

@register_cell_magic
def dockerexec(args, cell):
    """Run the cell body as a bash script inside a Docker container.

    Usage: ``%%dockerexec [docker exec options] CONTAINER``

    Parameters
    ----------
    args : str
        Text following the magic name. It is inserted verbatim into the
        ``docker exec`` command line, so it may carry several options
        (e.g. ``-u hadoop``) followed by the container name.
    cell : str
        Script text that is fed to ``bash`` on standard input.

    Raises
    ------
    UsageError
        If no container (empty ``args``) was given on the magic line.
    """
    import shlex
    from IPython.core.error import UsageError

    if not args.strip():
        raise UsageError("usage: %%dockerexec [docker exec options] CONTAINER")

    fd, filename = tempfile.mkstemp(suffix=".sh")
    try:
        # Write and close the script before the shell opens it for reading.
        with os.fdopen(fd, "wb") as script:
            script.write(cell.encode("utf8"))
        # Same call that the `!cmd` syntax expands to; output is shown in the cell.
        get_ipython().system(f"docker exec -i {args} bash < {shlex.quote(filename)}")
    finally:
        # mkstemp never deletes the file on its own.
        os.remove(filename)

# Instalação do Hadoop ![Hadoop](https://hadoop.apache.org/elephant.png)

- http://hadoop.apache.org/
- Versão 3.2.1

## Plataforma

- Ubuntu 18.04 (https://ubuntu.com/)
- Docker (https://www.docker.com/)
    - virtualização baseada em containers

In [264]:
%%bash

# Start a detached Ubuntu 18.04 container named (and with hostname) "hadoop",
# with memory and memory-swap limits of 6g, publishing ports 9870 and 8088.
# Minimal alternative: docker run -d -ti --rm --name hadoop ubuntu:18.04 /bin/bash
docker run \
    --detach --tty --rm \
    --memory 6g --memory-swap 6g \
    --name hadoop --hostname hadoop \
    --publish 9870:9870 --publish 8088:8088 \
    ubuntu:18.04

# Show the running containers
docker container ls

9ec76322789a4a8a5f6caa0bf2bb2c11e5569290b60d9ad6a421016fae6c7be6
CONTAINER ID   IMAGE          COMMAND       CREATED                  STATUS                  PORTS                                            NAMES
9ec76322789a   ubuntu:18.04   "/bin/bash"   Less than a second ago   Up Less than a second   0.0.0.0:8088->8088/tcp, 0.0.0.0:9870->9870/tcp   hadoop


## Dependências

- Java 8 (OpenJDK) - https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions
- Outros pacotes: ssh wget

In [265]:
%%dockerexec hadoop

# This script reaches bash through stdin, so a package-configuration prompt
# could swallow the remaining lines; force the non-interactive frontend.
export DEBIAN_FRONTEND=noninteractive

# Refresh the list of packages available to the operating system
# (apt-get rather than apt: apt's command line is not meant for scripts)
apt-get -q update

# Install the dependencies
apt-get -q -f -y install openjdk-8-jdk ssh wget apt-utils



Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic InRelease [242 kB]
Get:3 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1372 kB]
Get:4 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [237 kB]
Get:5 http://security.ubuntu.com/ubuntu bionic-security/multiverse amd64 Packages [15.3 kB]
Get:6 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [1816 kB]
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:8 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic/main amd64 Packages [1344 kB]
Get:10 http://archive.ubuntu.com/ubuntu bionic/universe amd64 Packages [11.3 MB]
Get:11 http://archive.ubuntu.com/ubuntu bionic/multiverse amd64 Packages [186 kB]
Get:12 http://archive.ubuntu.com/ubuntu bionic/restricted amd64 Packages [13.5 kB]
Get:13 http://archive.ubun

## Instalação

- Diretório base /opt
- Usuário/grupo: hadoop/hadoop
- Pacote com binários (versão 3.2.1): https://hadoop.apache.org/releases.html

In [266]:
%%dockerexec hadoop

# Create the hadoop user with a same-named group, a home directory and bash as shell
useradd --create-home --user-group --shell /bin/bash hadoop

# Allow everyone to write to /opt
chmod 777 /opt

In [267]:
%%bash

# Download the installation archive only when it is not already in the working
# directory (older releases are served from archive.apache.org)
[ -f hadoop-3.2.1.tar.gz ] || wget -q -c https://archive.apache.org/dist/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz

# Copy the installation archive into the container under /opt
docker cp hadoop-3.2.1.tar.gz hadoop:/opt

In [268]:
%%dockerexec hadoop

# Unpack the archive into /opt, hand the tree to hadoop:hadoop, then remove the archive
tar --extract --gzip --file=/opt/hadoop-3.2.1.tar.gz --directory=/opt
chown --recursive hadoop:hadoop /opt/hadoop-3.2.1
rm /opt/hadoop-3.2.1.tar.gz

In [269]:
%%dockerexec -u hadoop hadoop

# Create the /opt/hadoop link as user hadoop.
# -f/-n replace an existing link instead of following it, so re-running the
# cell does not create a stray /opt/hadoop/hadoop-3.2.1 link inside the target.
ln -sfn /opt/hadoop-3.2.1 /opt/hadoop

## Configuração

1. Criar arquivo /opt/envvars.sh com variáveis de ambiente JAVA_HOME, HADOOP_HOME e PATH

In [270]:
%%dockerexec -u hadoop hadoop

# Write /opt/envvars.sh. The quoted delimiter keeps $PATH and $HADOOP_HOME
# literal, so they are expanded only when the file is sourced.
cat > /opt/envvars.sh << 'EOF'
JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
HADOOP_HOME=/opt/hadoop

PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
EOF

# List /opt, print a separator, then show the generated file
ls -l /opt
echo "----------------"
cat /opt/envvars.sh

total 8
-rw-r--r-- 1 hadoop hadoop  119 Jan  3 14:02 envvars.sh
lrwxrwxrwx 1 hadoop hadoop   17 Jan  3 14:02 hadoop -> /opt/hadoop-3.2.1
drwxr-xr-x 9 hadoop hadoop 4096 Sep 10  2019 hadoop-3.2.1
----------------
JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
HADOOP_HOME=/opt/hadoop

PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin


# Arquivos de configuração

- hadoop-env.sh
- core-site.xml
- hdfs-site.xml
- yarn-site.xml
- mapred-site.xml

In [271]:
%%dockerexec -u hadoop hadoop

# Append the JAVA_HOME export as the last line of hadoop-env.sh
# (single quotes keep the sed address "$" away from the shell)
sed -i '$aexport JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64' /opt/hadoop/etc/hadoop/hadoop-env.sh

In [272]:
%%dockerexec -u hadoop hadoop

# Write core-site.xml: fs.defaultFS = hdfs://localhost:9000 and
# hadoop.tmp.dir = /opt/hadooptmpdir
cat > /opt/hadoop/etc/hadoop/core-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/hadooptmpdir</value>
</property>
</configuration>
EOF

In [273]:
%%dockerexec -u hadoop hadoop

# Write hdfs-site.xml: dfs.replication = 1
cat > /opt/hadoop/etc/hadoop/hdfs-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
EOF

In [274]:
%%dockerexec -u hadoop hadoop

# Write yarn-site.xml: yarn.nodemanager.aux-services = mapreduce_shuffle
cat > /opt/hadoop/etc/hadoop/yarn-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
EOF

In [275]:
%%dockerexec -u hadoop hadoop

# Write mapred-site.xml: mapreduce.framework.name = yarn and the MapReduce
# application classpath pointing at the jars under /opt/hadoop/share/hadoop/mapreduce
cat > /opt/hadoop/etc/hadoop/mapred-site.xml << EOF
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/mapreduce/lib/*</value>
</property>
</configuration>
EOF

# Formatação do HDFS no Namenode

`docker exec -u hadoop hadoop bash -c "source /opt/envvars.sh && hdfs namenode -format -force -nonInteractive"`

In [276]:
%%dockerexec -u hadoop hadoop

# Load the environment file, then format the NameNode without asking for confirmation
. /opt/envvars.sh
hdfs namenode -format -force -nonInteractive

2021-01-03 14:03:01,119 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = hadoop/172.17.0.2
STARTUP_MSG:   args = [-format, -force, -nonInteractive]
STARTUP_MSG:   version = 3.2.1
STARTUP_MSG:   classpath = /opt/hadoop-3.2.1/etc/hadoop:/opt/hadoop-3.2.1/share/hadoop/common/lib/jackson-core-2.9.8.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/jetty-io-9.3.24.v20180605.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/audience-annotations-0.5.0.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/checker-qual-2.5.2.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/paranamer-2.3.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/jackson-annotations-2.9.8.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/animal-sniffer-annotations-1.17.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/jettison-1.1.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/jersey-core-1.19.jar:/opt/hadoop-3.2.1/share/hadoop/common/lib/commons-io-2

In [277]:
%%dockerexec -u hadoop hadoop

source /opt/envvars.sh

# Start the HDFS daemons first, then the YARN daemons
for daemon in namenode datanode; do
    hdfs --daemon start "$daemon"
done
for daemon in resourcemanager nodemanager; do
    yarn --daemon start "$daemon"
done

# List the running Java processes
jps

6513 ResourceManager
6579 NodeManager
6631 Jps
6440 DataNode
6379 NameNode


In [278]:
%%dockerexec -u hadoop hadoop

source /opt/envvars.sh

# HDFS home directory for the hadoop user
hdfs dfs -mkdir -p /user/hadoop
hdfs dfs -chown hadoop:hadoop /user/hadoop

# World-writable /tmp; -p keeps the cell re-runnable when /tmp already exists
hdfs dfs -mkdir -p /tmp
hdfs dfs -chmod 777 /tmp

- http://localhost:8088 (interface web do YARN ResourceManager)
- http://localhost:9870 (interface web do HDFS NameNode)

In [279]:
%%dockerexec -u hadoop hadoop

source /opt/envvars.sh

# Run the bundled "pi" MapReduce example with 16 maps and 1000 samples per map
examples_jar="$HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.1.jar"
hadoop jar "$examples_jar" pi 16 1000

Number of Maps  = 16
Samples per Map = 1000
2021-01-03 14:04:02,209 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Wrote input for Map #0
2021-01-03 14:04:03,071 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Wrote input for Map #1
2021-01-03 14:04:03,122 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Wrote input for Map #2
2021-01-03 14:04:03,173 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Wrote input for Map #3
2021-01-03 14:04:03,287 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Wrote input for Map #4
2021-01-03 14:04:03,345 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Wrote i

In [280]:
%%bash

# Stop the hadoop container, then list the containers that are still running
docker container stop hadoop
docker container ls

hadoop
CONTAINER ID   IMAGE     COMMAND   CREATED   STATUS    PORTS     NAMES


# Dockerfile

- Arquivo de definição para criação da imagem de um container
- Referência: https://docs.docker.com/engine/reference/builder

In [25]:
%%writefile Dockerfile
# base image
FROM ubuntu:18.04

# Refresh the package index and install ssh, rsync, wget and OpenJDK 8 in a
# single layer, so a cached (stale) index layer is never reused for the install.
# DEBIAN_FRONTEND=noninteractive keeps package configuration from prompting
# during the build; the apt lists are removed to keep the layer small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get -f -y install ssh rsync wget openjdk-8-jdk \
    && rm -rf /var/lib/apt/lists/*

# change /opt permissions to 777
RUN chmod 777 /opt

# create hadoop user and group
RUN useradd -m -U -s /bin/bash hadoop

# hadoop is installed under /opt
WORKDIR /opt

# ADD unpacks a local tar archive into the destination directory.
# Download alternative:
# RUN wget -q -c https://downloads.apache.org/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
ADD hadoop-3.2.1.tar.gz /opt

RUN chown -R hadoop:hadoop /opt/hadoop-3.2.1

# execute next commands as user hadoop
USER hadoop

# version-independent path: /opt/hadoop -> hadoop-3.2.1
RUN ln -s hadoop-3.2.1 hadoop



#CMD ["/bin/bash"]

Overwriting Dockerfile


In [19]:
%%bash

# Build the image from the Dockerfile in the current directory and tag it "hadoopdocker"
docker build -t hadoopdocker .

Process is interrupted.


In [13]:
%%bash

# Start a detached container named "hadoop" from the hadoopdocker image;
# it is removed automatically when it stops
docker run --tty --interactive --detach --rm --name hadoop hadoopdocker

6961753d2be1aacc2fd4a1b94a64aee08fc1cf099df4209c018d9ce4875a5ea8


In [14]:
%%bash

# List the running containers
docker ps

CONTAINER ID   IMAGE          COMMAND       CREATED         STATUS         PORTS     NAMES
6961753d2be1   hadoopdocker   "/bin/bash"   9 seconds ago   Up 8 seconds             hadoop


In [15]:
%%bash

# Print the Java version available inside the hadoop container
docker exec hadoop java -version

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)


In [6]:
# Replace the kernel's PATH with a fixed list of system directories (including /usr/local/bin)
%env PATH /usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin

env: PATH=/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin


In [3]:
# Display the current value of PATH in the kernel's environment
%env PATH

'/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin'

In [162]:
%%bash

# Feed an inline script to bash inside the container as user hadoop.
# The quoted delimiter stops the host shell from expanding anything in the
# script, so no backslash escapes are needed.
docker exec -i -u hadoop hadoop bash << 'EOF'
# source /home/hadoop/.bashrc
# echo $PATH
# A=123
# echo $A
source /opt/envvars.sh
echo $JAVA_HOME
EOF

# docker exec -i -u hadoop hadoop bash < teste.sh

/usr/lib/jvm/java-1.8.0-openjdk-amd64


In [161]:
%%bash

# Experiment: pipe a heredoc (here a single empty line) through a sed filter
# that puts a backslash in front of every "$", and show the first lines.
cat << EOF | sed -e "s/[\$]/\\\&/g" | head

EOF

91476914769147691476


In [169]:
%%writefile script.sh

source /opt/envvars.sh
echo $JAVA_HOME
A=123
echo $A

Overwriting script.sh


In [170]:
%%bash

# Feed script.sh on stdin to bash inside the hadoop container, as user hadoop
docker exec -i -u hadoop hadoop bash < script.sh

/usr/lib/jvm/java-1.8.0-openjdk-amd64
123


In [171]:
# Magic - write_and_run - write file and execute
from IPython.core.magic import register_cell_magic


@register_cell_magic
def write_and_run(line, cell):
    """Save the cell source to a file, then execute it in the current kernel.

    Usage: ``%%write_and_run [-a] FILENAME``

    Parameters
    ----------
    line : str
        Magic arguments. The last whitespace-separated token is the target
        file; when the line is exactly ``-a FILENAME`` the cell is appended
        to the file instead of overwriting it.
    cell : str
        Source text that is written to the file and then run.

    Raises
    ------
    ValueError
        If no file name was given on the magic line.
    """
    argz = line.split()
    if not argz:
        # Without this guard the lookup below fails with a bare IndexError.
        raise ValueError("usage: %%write_and_run [-a] FILENAME")

    target = argz[-1]
    mode = 'a' if len(argz) == 2 and argz[0] == '-a' else 'w'

    # Explicit encoding: cell text is Unicode and the platform default may not
    # be able to represent it.
    with open(target, mode, encoding='utf-8') as f:
        f.write(cell)
    get_ipython().run_cell(cell)

In [174]:
%%write_and_run teste.py

a = [1,2,3]
a += [4]
print(a)

[1, 2, 3, 4]


In [191]:
from IPython.core.magic import register_cell_magic
import tempfile
import os

@register_cell_magic
def dockerexec(args, cell):
    """Run the cell body as a bash script inside a Docker container.

    Usage: ``%%dockerexec [docker exec options] CONTAINER``

    Parameters
    ----------
    args : str
        Text following the magic name. It is inserted verbatim into the
        ``docker exec`` command line, so it may carry several options
        (e.g. ``-u hadoop``) followed by the container name.
    cell : str
        Script text that is fed to ``bash`` on standard input.

    Raises
    ------
    UsageError
        If no container (empty ``args``) was given on the magic line.
    """
    import shlex
    from IPython.core.error import UsageError

    if not args.strip():
        raise UsageError("usage: %%dockerexec [docker exec options] CONTAINER")

    fd, filename = tempfile.mkstemp(suffix=".sh")
    try:
        # Write and close the script before the shell opens it for reading.
        with os.fdopen(fd, "wb") as script:
            script.write(cell.encode("utf8"))
        # Same call that the `!cmd` syntax expands to; output is shown in the cell.
        get_ipython().system(f"docker exec -i {args} bash < {shlex.quote(filename)}")
    finally:
        # mkstemp never deletes the file on its own.
        os.remove(filename)

In [193]:
%%dockerexec hadoop

# Print the user the script runs as when no -u option is passed
whoami

root


In [186]:
%%dockerexec -u hadoop hadoop

# The magic line must name the target container: dockerexec inserts it into
# `docker exec -i <args> bash`, which is malformed when it is empty.
source /opt/envvars.sh
echo $JAVA_HOME
A=123
echo $A

/usr/lib/jvm/java-1.8.0-openjdk-amd64
123
