In [None]:
# source /opt/envvars.sh

!pip3 install python-dotenv
%load_ext dotenv
%dotenv -o /opt/envvars.sh
%env

# HDFS

## HDFS - Web Interface

- Master node
    - NameNode: http://localhost:9870
    - Secondary NameNode: http://localhost:9868
- Worker node
    - hadoop1
        - DataNode: http://localhost:9864
    - hadoop2
        - DataNode: http://localhost:9865
    - hadoop3
        - DataNode: http://localhost:9866

## HDFS - CLI

In [None]:
%%bash

# Print the hdfs usage text. `help` is not an hdfs subcommand, so `hdfs help`
# ends with an error status (which %%bash reports as a failed cell);
# `--help` prints the usage text and exits successfully.
hdfs --help

## Filesystem Basic Commands

- https://hadoop.apache.org/docs/r3.2.3/hadoop-project-dist/hadoop-common/FileSystemShell.html

Download books from Project Gutenberg (http://www.gutenberg.org/)

- Moby Dick; Or, The Whale by Herman Melville
- Pride and Prejudice by Jane Austen
- Dracula by Bram Stoker

In [None]:
%%bash

cd /opt/datasets

# fetch_book URL FILE: quietly download URL into FILE, continuing a partial download
fetch_book() {
    wget -qc "$1" -O "$2"
}

fetch_book http://www.gutenberg.org/files/2701/2701-0.txt mobydick.txt
fetch_book http://www.gutenberg.org/files/1342/1342-0.txt prideandprejudice.txt
fetch_book http://www.gutenberg.org/cache/epub/345/pg345.txt dracula.txt

# show what is now available locally
ls /opt/datasets

In [None]:
%%bash

# Scratch cell for trying the basic `hdfs dfs` commands: every hdfs command
# below is commented out, so running the cell as-is only changes into the
# local dataset directory. Uncomment the command(s) to try and re-run the cell.
# NOTE(review): HDFS paths without a leading "/" (e.g. allbooks.txt) are
# presumably resolved against the HDFS home directory /user/hadoop -- confirm.

cd /opt/datasets

# create gutenberg folder in HDFS
# hdfs dfs -mkdir /user/hadoop/gutenberg

# copy books to HDFS (two alternative commands for the same upload; use one)
# hdfs dfs -put * /user/hadoop/gutenberg
# hdfs dfs -copyFromLocal * /user/hadoop/gutenberg

# list files in HDFS
# hdfs dfs -ls /user/hadoop/gutenberg

# show first KB of file
# hdfs dfs -head /user/hadoop/gutenberg/mobydick.txt

# show last KB of file
# hdfs dfs -tail /user/hadoop/gutenberg/prideandprejudice.txt

# show whole file - CAREFUL: prints the complete book to the cell output
# hdfs dfs -cat /user/hadoop/gutenberg/dracula.txt

# append the contents of the three local books to one file in HDFS
# hdfs dfs -appendToFile mobydick.txt prideandprejudice.txt dracula.txt /user/hadoop/allbooks.txt

# copy allbooks.txt (in HDFS) to gutenberg directory (in HDFS),
# then list everything recursively with human-readable sizes
# hdfs dfs -cp allbooks.txt /user/hadoop/gutenberg
# hdfs dfs -ls -h -R

# retrieve allbooks.txt from HDFS into the current local directory
# (two alternative commands; use one)
# hdfs dfs -get allbooks.txt .
# hdfs dfs -copyToLocal /user/hadoop/allbooks.txt .

# remove file (relative and absolute form of the same path; use one)
# hdfs dfs -rm allbooks.txt
# hdfs dfs -rm /user/hadoop/allbooks.txt

# mv file (also used for renaming)
# hdfs dfs -mv gutenberg/allbooks.txt gutenberg/books.txt

# print statistics on folder: one line per entry with the fields named in
# the header printed first (name, type, size, replication)
# printf "name\ttype\tsize\treps\n"
# hdfs dfs -stat "%n %F %b %r" /user/hadoop/gutenberg/*

# getmerge: merge the files of the HDFS folder into one local file
# hdfs dfs -getmerge /user/hadoop/gutenberg mergebooks.txt

# remove directory and files (-R recursive)
# hdfs dfs -rm -R /user/hadoop/gutenberg

## Utilization in a MapReduce job

In [None]:
%%bash

cd /opt/datasets

# -p: create missing parent directories and do not fail when the folder
# already exists, so the cell can be re-run
hdfs dfs -mkdir -p /user/hadoop/gutenberg
# -f: overwrite books uploaded by a previous run instead of failing with "File exists"
hdfs dfs -put -f mobydick.txt prideandprejudice.txt dracula.txt /user/hadoop/gutenberg

In [None]:
%%bash

# run the wordcount example application: read the books from the input
# folder and write the result to the output folder (both in HDFS)
examples_jar=./hadoop-mapreduce-examples-3.2.3.jar
input_dir=/user/hadoop/gutenberg
output_dir=/user/hadoop/gutenberg-output

cd /opt/hadoop/share/hadoop/mapreduce
hadoop jar "${examples_jar}" wordcount "${input_dir}" "${output_dir}"

In [None]:
%%bash

# show what the job wrote to its output folder
output_dir=/user/hadoop/gutenberg-output
hdfs dfs -ls "${output_dir}"

In [None]:
%%bash

# print the beginning of the reducer output file
result_file=/user/hadoop/gutenberg-output/part-r-00000
hdfs dfs -head "${result_file}"

In [None]:
%%bash

cd /tmp

# copy HDFS file to local filesystem
# -f: overwrite the local copy left by an earlier run; without it the copy
# fails when the file exists and `head` below would show the stale file
hdfs dfs -get -f /user/hadoop/gutenberg-output/part-r-00000 gutenberg-output.txt
head /tmp/gutenberg-output.txt

In [None]:
%%bash

# delete the job output folder from HDFS, including everything in it
output_dir=/user/hadoop/gutenberg-output
hdfs dfs -rm -R "${output_dir}"

In [None]:
%%bash

# same wordcount application, this time requesting two reducers
examples_jar=./hadoop-mapreduce-examples-3.2.3.jar
reducers=2

cd /opt/hadoop/share/hadoop/mapreduce
hadoop jar "${examples_jar}" wordcount \
    "-Dmapreduce.job.reduces=${reducers}" \
    /user/hadoop/gutenberg /user/hadoop/gutenberg-output

In [None]:
%%bash

# show what the two-reducer job wrote to its output folder
output_dir=/user/hadoop/gutenberg-output
hdfs dfs -ls "${output_dir}"

In [None]:
%%bash

output_dir=/user/hadoop/gutenberg-output
local_copy=/tmp/gutenberg-output.txt

cd /tmp

# merge the files of the HDFS output folder into one local file and preview it
hdfs dfs -getmerge "${output_dir}" gutenberg-output.txt
head "${local_copy}"

# clean up the output folder on HDFS
hdfs dfs -rm -R "${output_dir}"

## Advanced Commands

- https://hadoop.apache.org/docs/r3.2.3/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html

### Verify HDFS cluster status

In [None]:
%%bash

# print_separator: blank line, a rule of 40 '=' characters, two line breaks
# (printf pads an empty string to 40 blanks, tr turns the blanks into '=')
print_separator() {
    printf "\n%40s\n\n" | tr ' ' '='
}

# print topology
hdfs dfsadmin -printTopology

print_separator

# print the filesystem / DataNode report
hdfs dfsadmin -report

### Replication factor

In [None]:
%%bash

# Replication walkthrough: only the last command is active. Uncomment the
# earlier commands step by step and re-run the cell to compare the block
# locations before and after each change of the replication factor.

# list folder block location
#hdfs fsck /user/hadoop/gutenberg -files -blocks -locations

# change replication factor of all files in directory to 3
#hdfs dfs -setrep 3 /user/hadoop/gutenberg

# list folder block location
#hdfs fsck /user/hadoop/gutenberg -files -blocks -locations

# change replication factor back to 2
#hdfs dfs -setrep 2 /user/hadoop/gutenberg

# list folder block location (files, their blocks, and where the blocks are stored)
hdfs fsck /user/hadoop/gutenberg -files -blocks -locations

### Decommission nodes

- dfs.hosts.exclude in hdfs-site.xml

In [None]:
%%bash

# Decommissioning hadoop1: the exclude file gets exactly one line naming the host
exclude_file=/opt/hadoop/etc/hadoop/dfs.exclude
printf '%s\n' hadoop1 > "${exclude_file}"

# ask the NameNode to re-read its host lists
hdfs dfsadmin -refreshNodes

Check the node status in the NameNode web interface: http://localhost:9870

In [None]:
%%bash

# report HDFS status after hadoop1 was written to the exclude file and the
# node list was refreshed
# NOTE(review): hadoop1 is presumably reported as decommissioning or
# decommissioned here -- confirm against the actual output
hdfs dfsadmin -report

In [None]:
%%bash

# Recommission all nodes: truncate the exclude file so that no host is listed
exclude_file=/opt/hadoop/etc/hadoop/dfs.exclude
: > "${exclude_file}"

# ask the NameNode to re-read its host lists
hdfs dfsadmin -refreshNodes

In [None]:
%%bash

# report HDFS status after the exclude file was emptied and the node list
# was refreshed
hdfs dfsadmin -report

### Handling datanode failures

- timeouts defined in hdfs-site.xml 
    - dfs.namenode.heartbeat.recheck-interval = 10000 (10 seconds)
    - dfs.heartbeat.interval = 3 seconds
- timeout = 2 x recheck-interval + 10 x heartbeat.interval
    - timeout = 50 seconds

In [None]:
%%bash

# print the two configured values that enter the DataNode timeout formula:
# the recheck interval first, then the heartbeat interval
for conf_key in dfs.namenode.heartbeat.recheck-interval dfs.heartbeat.interval; do
    hdfs getconf -confKey "${conf_key}"
done

In [None]:
%%bash

# simulate node fault: send SIGKILL (-9) to the process whose pid is stored
# in the datanode pid file on hadoop1
# the single quotes keep $(...) from being expanded by the local shell, so
# the pid file is read on hadoop1 itself
ssh hadoop1 'kill -9 $(cat /tmp/hadoop-hadoop-datanode.pid)'

Check the node status in the NameNode web interface: http://localhost:9870

In [None]:
%%bash

# report HDFS status after the datanode process on hadoop1 was killed
# NOTE(review): going by the timeout documented in this notebook (50 seconds),
# hadoop1 is presumably still listed as live until that time has passed --
# re-run the cell after the timeout to confirm
hdfs dfsadmin -report

In [None]:
%%bash

# Restart the datanode daemon on hadoop1 (the command starts a datanode,
# not a nodemanager)
ssh hadoop1 /opt/hadoop/bin/hdfs --daemon start datanode

In [None]:
%%bash

# report HDFS status after the datanode on hadoop1 was started again
hdfs dfsadmin -report