In [1]:
%load_ext dockermagic

# HDFS

## HDFS - Web Interface

- Master node
    - NameNode: http://localhost:9870
    - Secondary NameNode: http://localhost:9868
- Worker node
    - hadoop1
        - DataNode: http://localhost:9864
    - hadoop2
        - DataNode: http://localhost:9865
    - hadoop3
        - DataNode: http://localhost:9866

## HDFS - CLI

In [2]:
%%dockerexec hadoop

source /opt/envvars.sh
hdfs help

Usage: hdfs [OPTIONS] SUBCOMMAND [SUBCOMMAND OPTIONS]

  OPTIONS is none or any of:

tput: No value for $TERM and no -T specified
--buildpaths                       attempt to add class files from build tree
--config dir                       Hadoop config directory
--daemon (start|status|stop)       operate on a daemon
--debug                            turn on shell script debug mode
--help                             usage information
--hostnames list[,of,host,names]   hosts to use in worker mode
--hosts filename                   list of hosts to use in worker mode
--loglevel level                   set the log4j level for this command
--workers                          turn on worker mode

  SUBCOMMAND is one of:


    Admin Commands:

tput: No value for $TERM and no -T specified
cacheadmin           configure the HDFS cache
crypto               configure HDFS encryption zones
debug                run a Debug Admin to execute HDFS debug commands
dfsadmin             run a DFS admi

## Filesystem Basic Commands

- https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/FileSystemShell.html

Download books from Gutenberg project (http://www.gutenberg.org/)

- Moby Dick; Or, The Whale by Herman Melville
- Pride and Prejudice by Jane Austen
- Dracula by Bram Stoker

In [4]:
%%dockerexec hadoop

source /opt/envvars.sh
mkdir /opt/datasets
cd /opt/datasets

wget -qc http://www.gutenberg.org/files/2701/2701-0.txt -O mobydick.txt
wget -qc http://www.gutenberg.org/files/1342/1342-0.txt -O prideandprejudice.txt
wget -qc http://www.gutenberg.org/cache/epub/345/pg345.txt -O dracula.txt

ls /opt/datasets

dracula.txt
mobydick.txt
prideandprejudice.txt


In [10]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/datasets

# create gutenberg folder in HDFS
# hdfs dfs -mkdir /user/hadoop/gutenberg

# copy books to HDFS
# hdfs dfs -put * /user/hadoop/gutenberg
# hdfs dfs -copyFromLocal * /user/hadoop/gutenberg

# list files in HDFS
# hdfs dfs -ls /user/hadoop/gutenberg

# show first KB of file
# hdfs dfs -head /user/hadoop/gutenberg/mobydick.txt

# show last KB of file
# hdfs dfs -tail /user/hadoop/gutenberg/prideandprejudice.txt

# show whole file - CAREFUL
# hdfs dfs -cat /user/hadoop/gutenberg/dracula.txt

# append file contents to a file in HDFS
# hdfs dfs -appendToFile mobydick.txt prideandprejudice.txt dracula.txt /user/hadoop/allbooks.txt

# copy allbooks.txt (in HDFS) to gutenberg directory (in HDFS)
# hdfs dfs -cp allbooks.txt /user/hadoop/gutenberg
# hdfs dfs -ls -h -R

# retrieve allbooks.txt from HDFS
# hdfs dfs -get allbooks.txt .
# hdfs dfs -copyToLocal /user/hadoop/allbooks.txt .

# remove file
# hdfs dfs -rm allbooks.txt
# hdfs dfs -rm /user/hadoop/allbooks.txt

# mv file (also used for renaming)
# hdfs dfs -mv gutenberg/allbooks.txt gutenberg/books.txt

# print statistics on folder
# printf "name\ttype\tsize\treps\n"
hdfs dfs -stat "%n %F %b %r" /user/hadoop/gutenberg/*

# getmerge
# hdfs dfs -getmerge /user/hadoop/gutenberg mergebooks.txt

# remove directory and files (-R recursive)
# hdfs dfs -rm -R /user/hadoop/gutenberg

dracula.txt regular file 890355 2
mobydick.txt regular file 1276235 2
prideandprejudice.txt regular file 772186 2


## Utilization in a MapReduce job

In [11]:
%%dockerexec hadoop

source /opt/envvars.sh
cd /opt/datasets

hdfs dfs -mkdir /user/hadoop/gutenberg
hdfs dfs -put mobydick.txt prideandprejudice.txt dracula.txt /user/hadoop/gutenberg

mkdir: `/user/hadoop/gutenberg': File exists
put: `/user/hadoop/gutenberg/mobydick.txt': File exists
put: `/user/hadoop/gutenberg/prideandprejudice.txt': File exists
put: `/user/hadoop/gutenberg/dracula.txt': File exists


In [12]:
%%dockerexec hadoop

source /opt/envvars.sh
cd /opt/hadoop/share/hadoop/mapreduce

# run wordcount application
hadoop jar ./hadoop-mapreduce-examples-$HADOOP_VERSION.jar wordcount \
/user/hadoop/gutenberg /user/hadoop/gutenberg-output

2023-11-30 16:41:01,703 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop/172.18.0.5:8032
2023-11-30 16:41:01,800 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.18.0.5:10200
2023-11-30 16:41:01,975 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1701371922987_0006
2023-11-30 16:41:02,231 INFO input.FileInputFormat: Total input files to process : 3
2023-11-30 16:41:02,342 INFO mapreduce.JobSubmitter: number of splits:3
2023-11-30 16:41:02,477 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1701371922987_0006
2023-11-30 16:41:02,477 INFO mapreduce.JobSubmitter: Executing with tokens: []
2023-11-30 16:41:02,639 INFO conf.Configuration: resource-types.xml not found
2023-11-30 16:41:02,640 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2023-11-30 16:41:02,686 INFO impl.YarnClientImpl: Submitted application application_17013719

In [13]:
%%dockerexec hadoop

source /opt/envvars.sh

# list output folder contents
hdfs dfs -ls /user/hadoop/gutenberg-output

Found 2 items
-rw-r--r--   2 hadoop hadoop          0 2023-11-30 16:41 /user/hadoop/gutenberg-output/_SUCCESS
-rw-r--r--   2 hadoop hadoop     572618 2023-11-30 16:41 /user/hadoop/gutenberg-output/part-r-00000


In [14]:
%%dockerexec hadoop

source /opt/envvars.sh
# show head
hdfs dfs -head /user/hadoop/gutenberg-output/part-r-00000

"Defects,"	2
"Information	2
"Plain	4
"Project	10
"Right	2
#1342]	1
#2701]	1
#345]	1
$20,000,000!	1
$5,000)	3
$7,000,000.	1
&	24
'AS-IS',	2
("the	2
($1	3
(1775)	1
(801)	3
(Ahab’s)	1
(Albatross)	1
(American)	1
(Bunger,	1
(By	1
(Entered	1
(Fife)	1
(Greenland	1
(I	2
(It	1
(Kept	1
(Lady	1
(Not	1
(Pig-fish	1
(Pull,	1
(Sent	1
(Sperm	3
(Spiders	1
(Spring,	1
(Steelkilt)	1
(Steelkilt’s)	1
(Strong,	1
(Supplied	3
(Terra	1
(This	1
(Unopened	2
(Why	1
(_A	1
(_Advancing_.)	1
(_Ahab	3
(_Algerine	1
(_As	1
(_Ascending,	1
(_Aside_)	1
(_Aside_.)	1
(_Black	1
(_Carpenter	1
(_Confidence_)	1
(_Dancing_)	1
(_Duodecimo_),	3
(_Duodecimo_).	1
(_During	1
(_Enter	1
(_Fin-Back_).—Under	1
(_Folio_)	1
(_Folio_),	6
(_Foresail	1
(_Grampus_).—Though	1
(_Hump	1
(_Huzza	1
(_Kept	1
(_Killer_).—Of	1
(_Leaps	1
(_Mealy-mouthed	1
(_Mem._,	6
(_Narwhale_),	1
(_Nudging_.)	1
(_Octavo_),	6
(_Octavo_).	1
(_Pasted	1
(_Quietly	1
(_Razor	1
(_Reclining	2
(_Reclining_.)	1
(_Right	1
(_Sings,	1
(_Sperm	1
(_Stubb	1
(_Sulky	1
(_Sulphur	1
(_The

In [15]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /tmp

# copy HDFS file to local filesystem
hdfs dfs -get /user/hadoop/gutenberg-output/part-r-00000 gutenberg-output.txt
head /tmp/gutenberg-output.txt

"Defects,"	2
"Information	2
"Plain	4
"Project	10
"Right	2
#1342]	1
#2701]	1
#345]	1
$20,000,000!	1
$5,000)	3


In [16]:
%%dockerexec hadoop

source /opt/envvars.sh

# remove folder on HDFS
hdfs dfs -rm -R /user/hadoop/gutenberg-output

Deleted /user/hadoop/gutenberg-output


In [17]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/hadoop/share/hadoop/mapreduce

# run wordcount application with 2 reducers
hadoop jar ./hadoop-mapreduce-examples-$HADOOP_VERSION.jar wordcount \
-Dmapreduce.job.reduces=2 \
/user/hadoop/gutenberg /user/hadoop/gutenberg-output

2023-11-30 16:42:44,757 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop/172.18.0.5:8032
2023-11-30 16:42:44,860 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.18.0.5:10200
2023-11-30 16:42:45,025 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1701371922987_0007
2023-11-30 16:42:45,326 INFO input.FileInputFormat: Total input files to process : 3
2023-11-30 16:42:45,436 INFO mapreduce.JobSubmitter: number of splits:3
2023-11-30 16:42:45,574 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1701371922987_0007
2023-11-30 16:42:45,575 INFO mapreduce.JobSubmitter: Executing with tokens: []
2023-11-30 16:42:45,709 INFO conf.Configuration: resource-types.xml not found
2023-11-30 16:42:45,709 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2023-11-30 16:42:45,751 INFO impl.YarnClientImpl: Submitted application application_17013719

In [18]:
%%dockerexec hadoop

source /opt/envvars.sh

# list output folder contents
hdfs dfs -ls /user/hadoop/gutenberg-output

Found 3 items
-rw-r--r--   2 hadoop hadoop          0 2023-11-30 16:43 /user/hadoop/gutenberg-output/_SUCCESS
-rw-r--r--   2 hadoop hadoop     285660 2023-11-30 16:43 /user/hadoop/gutenberg-output/part-r-00000
-rw-r--r--   2 hadoop hadoop     286958 2023-11-30 16:43 /user/hadoop/gutenberg-output/part-r-00001


In [19]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /tmp

# copy HDFS file to local filesystem
hdfs dfs -getmerge /user/hadoop/gutenberg-output gutenberg-output.txt
head /tmp/gutenberg-output.txt

hdfs dfs -rm -R /user/hadoop/gutenberg-output

"Project	10
$20,000,000!	1
$7,000,000.	1
'AS-IS',	2
("the	2
($1	3
(1775)	1
(Ahab’s)	1
(American)	1
(Bunger,	1
Deleted /user/hadoop/gutenberg-output


## Advanced Commands

- https://hadoop.apache.org/docs/r3.3.6/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html

### Verify HDFS cluster status

In [20]:
%%dockerexec hadoop

source /opt/envvars.sh

# print topology
hdfs dfsadmin -printTopology

printf "\n%40s\n\n" |tr " " "="

hdfs dfsadmin -report

Rack: /default-rack
   172.18.0.4:9866 (hadoop2.docker_hadoopnet) In Service
   172.18.0.2:9866 (hadoop1.docker_hadoopnet) In Service
   172.18.0.3:9866 (hadoop3.docker_hadoopnet) In Service



Configured Capacity: 100908072960 (93.98 GB)
Present Capacity: 24622888724 (22.93 GB)
DFS Remaining: 23685783552 (22.06 GB)
DFS Used: 937105172 (893.69 MB)
DFS Used%: 3.81%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (3):

Name: 172.18.0.2:9866 (hadoop1.docker_hadoopnet)
Hostname: hadoop1
Decommission Status : Normal
Configured Ca

### Replication factor

In [21]:
%%dockerexec hadoop

source /opt/envvars.sh

# list folder block location
hdfs fsck /user/hadoop/gutenberg -files -blocks -locations

# change replication factor of all files in directory to 3
# hdfs dfs -setrep 3 /user/hadoop/gutenberg

# list folder block location
# hdfs fsck /user/hadoop/gutenberg -files -blocks -locations

# change replication factor back to 2
# hdfs dfs -setrep 2 /user/hadoop/gutenberg

# list folder block location
# hdfs fsck /user/hadoop/gutenberg -files -blocks -locations

Connecting to namenode via http://hadoop:9870/fsck?ugi=hadoop&files=1&blocks=1&locations=1&path=%2Fuser%2Fhadoop%2Fgutenberg
FSCK started by hadoop (auth:SIMPLE) from /172.18.0.5 for path /user/hadoop/gutenberg at Thu Nov 30 16:44:25 BRT 2023

/user/hadoop/gutenberg <dir>
/user/hadoop/gutenberg/dracula.txt 890355 bytes, replicated: replication=2, 1 block(s):  OK
0. BP-1253975299-172.18.0.5-1701371903365:blk_1073741909_1085 len=890355 Live_repl=2  [DatanodeInfoWithStorage[172.18.0.4:9866,DS-53b865e7-f9b3-4d65-8bf8-c6df9e92dd27,DISK], DatanodeInfoWithStorage[172.18.0.3:9866,DS-3afdbdfe-2ce1-4c1c-8bf9-b25389e6ebc9,DISK]]

/user/hadoop/gutenberg/mobydick.txt 1276235 bytes, replicated: replication=2, 1 block(s):  OK
0. BP-1253975299-172.18.0.5-1701371903365:blk_1073741910_1086 len=1276235 Live_repl=2  [DatanodeInfoWithStorage[172.18.0.2:9866,DS-5077a3a4-0e1b-40f0-875f-ff1a597a1b1b,DISK], DatanodeInfoWithStorage[172.18.0.4:9866,DS-53b865e7-f9b3-4d65-8bf8-c6df9e92dd27,DISK]]

/user/hadoop/gut

### Decomission nodes

- dfs.hosts.exclude in hdfs-site.xml

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Decomissioning hadoop1
cat > /opt/hadoop/etc/hadoop/dfs.exclude << EOF
hadoop1
EOF

hdfs dfsadmin -refreshNodes

http://localhost:9870

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# report HDFS status
hdfs dfsadmin -report

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Recomission all nodes
cat > /opt/hadoop/etc/hadoop/dfs.exclude << EOF
EOF

hdfs dfsadmin -refreshNodes

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# report HDFS status
hdfs dfsadmin -report

### Handling datanode failures

- timeouts defined in hdfs-site.xml 
    - dfs.namenode.heartbeat.recheck-interval = 10000 (10 seconds)
    - dfs.heartbeat.interval = 3 seconds
- timeout = 2 x recheck-interval + 10 x heartbeat.interval
    - timeout = 50 seconds

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# get dfs.namenode.heartbeat.recheck-interval
hdfs getconf -confKey dfs.namenode.heartbeat.recheck-interval

# get dfs.heartbeat.interval
hdfs getconf -confKey dfs.heartbeat.interval

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# simulate node fault
ssh hadoop1 'kill -9 $(cat /tmp/hadoop-hadoop-datanode.pid)'

http://localhost:9870

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

hdfs dfsadmin -report

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Restart nodemanager
ssh hadoop1 /opt/hadoop/bin/hdfs --daemon start datanode

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

hdfs dfsadmin -report