In [119]:
%load_ext dockermagic

# HDFS

## HDFS - Web Interface

- Master node
    - NameNode: http://localhost:9870
    - Secondary NameNode: http://localhost:9868
- Worker nodes
    - hadoop1
        - NodeManager: http://localhost:8042
    - hadoop2
        - NodeManager: http://localhost:8043
    - hadoop3
        - NodeManager: http://localhost:8044

## HDFS - CLI

In [None]:
%%dockerexec -u hadoop hadoop
# Source the environment setup script (presumably puts the Hadoop tools on PATH).
. /opt/envvars.sh

# Print the usage summary of the hdfs command-line tool.
hdfs help

## Filesystem Basic Commands

- https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-common/FileSystemShell.html

Download the following books from Project Gutenberg (https://www.gutenberg.org/):

- Moby Dick; Or, The Whale by Herman Melville
- Pride and Prejudice by Jane Austen
- Dracula by Bram Stoker

In [121]:
%%dockerexec -u hadoop hadoop

DATASETS_DIR=/opt/datasets

# Create the local dataset directory on first use, then work from inside it.
if [ ! -d "$DATASETS_DIR" ]; then
    mkdir "$DATASETS_DIR"
fi
cd "$DATASETS_DIR"

# fetch_book URL TARGET
# Download URL quietly (-q), resuming a partial file if present (-c),
# and store it under the friendlier name TARGET.
fetch_book() {
    wget -qc "$1" -O "$2"
}

fetch_book http://www.gutenberg.org/files/2701/2701-0.txt mobydick.txt
fetch_book http://www.gutenberg.org/files/1342/1342-0.txt prideandprejudice.txt
fetch_book http://www.gutenberg.org/cache/epub/345/pg345.txt dracula.txt

# Show what is now available locally.
ls "$DATASETS_DIR"

dracula.txt
mobydick.txt
prideandprejudice.txt
stations.csv
trips.csv.zip


In [47]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh
cd /opt/datasets

# Reference cell for the basic `hdfs dfs` file commands.
# Every command below is commented out (apparently on purpose): uncomment the
# one(s) you want to try, then run the cell from /opt/datasets.
# Where two commands sit under one heading (e.g. -put / -copyFromLocal) they
# appear to be alternatives for the same step; pick one.
# Relative HDFS paths such as `allbooks.txt` are presumably resolved against
# the HDFS home directory /user/hadoop — verify on your cluster.

# create gutenberg folder in HDFS
# hdfs dfs -mkdir /user/hadoop/gutenberg

# copy books to HDFS
# NOTE: `*` expands to every file in the current local directory
# (/opt/datasets), not only the three books.
# hdfs dfs -put * /user/hadoop/gutenberg
# hdfs dfs -copyFromLocal * /user/hadoop/gutenberg

# list files in HDFS
# hdfs dfs -ls /user/hadoop/gutenberg

# show first KB of file
# hdfs dfs -head /user/hadoop/gutenberg/mobydick.txt

# show last KB of file
# hdfs dfs -tail /user/hadoop/gutenberg/prideandprejudice.txt

# show whole file - CAREFUL
# (prints the complete book to the cell output)
# hdfs dfs -cat /user/hadoop/gutenberg/dracula.txt

# append file contents to a file in HDFS
# (the three local books are appended, in order, to a single HDFS file)
# hdfs dfs -appendToFile mobydick.txt prideandprejudice.txt dracula.txt /user/hadoop/allbooks.txt

# copy allbooks.txt (in HDFS) to gutenberg directory (in HDFS)
# hdfs dfs -cp allbooks.txt /user/hadoop/gutenberg
# recursive (-R) listing with human-readable sizes (-h)
# hdfs dfs -ls -h -R

# retrieve allbooks.txt from HDFS
# (downloads into the current local directory, /opt/datasets)
# hdfs dfs -get allbooks.txt .
# hdfs dfs -copyToLocal /user/hadoop/allbooks.txt .

# remove file
# hdfs dfs -rm allbooks.txt
# hdfs dfs -rm /user/hadoop/allbooks.txt

# mv file (also used for renaming)
# hdfs dfs -mv gutenberg/allbooks.txt gutenberg/books.txt

# print statistics on folder
# NOTE(review): the header is tab-separated while the -stat format string
# uses spaces, so the columns will not line up with the header.
# printf "name\ttype\tsize\treps\n"
# hdfs dfs -stat "%n %F %b %r" /user/hadoop/gutenberg/*

# getmerge
# (concatenates the files of the HDFS folder into one local file)
# hdfs dfs -getmerge /user/hadoop/gutenberg mergebooks.txt

# remove directory and files (-R recursive)
# hdfs dfs -rm -R /user/hadoop/gutenberg

2021-01-06 12:05:35,513 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-06 12:05:35,866 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-06 12:05:35,980 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
Found 3 items
-rw-r--r--   2 hadoop hadoop     799738 2021-01-06 12:05 /user/hadoop/gutenberg/1342-0.txt
-rw-r--r--   2 hadoop hadoop    1276201 2021-01-06 12:05 /user/hadoop/gutenberg/2701-0.txt
-rw-r--r--   2 hadoop hadoop     883160 2021-01-06 12:05 /user/hadoop/gutenberg/pg345.txt


## Using HDFS in a MapReduce job

In [122]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

cd /opt/datasets

# Create the HDFS input directory. -p makes this a no-op when the directory
# already exists, so the cell can be re-run without an error.
hdfs dfs -mkdir -p /user/hadoop/gutenberg

# Upload the three books. -f overwrites copies left by a previous run instead
# of failing with "File exists".
hdfs dfs -put -f mobydick.txt prideandprejudice.txt dracula.txt /user/hadoop/gutenberg

2021-01-08 12:54:59,516 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-08 12:54:59,870 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-08 12:54:59,976 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


In [100]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh
cd /opt/hadoop/share/hadoop/mapreduce

# A MapReduce job refuses to start when its output directory already exists,
# so remove any result left by a previous run first
# (-r: recursive, -f: no error if the directory is absent).
hdfs dfs -rm -r -f /user/hadoop/gutenberg-output

# run wordcount application
# arguments: <HDFS input directory> <HDFS output directory>
hadoop jar ./hadoop-mapreduce-examples-3.2.1.jar wordcount \
/user/hadoop/gutenberg /user/hadoop/gutenberg-output

2021-01-06 23:44:30,523 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-06 23:44:30,813 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-06 23:44:30,910 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-06 23:44:35,245 INFO client.RMProxy: Connecting to ResourceManager at hadoop/172.17.0.2:8032
2021-01-06 23:44:35,627 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.17.0.2:10200
2021-01-06 23:44:36,192 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1609940116821_0001
2021-01-06 23:44:36,511 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-06 23:44:36,861 INFO input.FileInputFormat: Total input files 

In [101]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

OUTPUT_DIR=/user/hadoop/gutenberg-output

# Show the files the job wrote to its HDFS output folder.
hdfs dfs -ls "$OUTPUT_DIR"

Found 2 items
-rw-r--r--   2 hadoop hadoop          0 2021-01-06 23:45 /user/hadoop/gutenberg-output/_SUCCESS
-rw-r--r--   2 hadoop hadoop     566225 2021-01-06 23:45 /user/hadoop/gutenberg-output/part-r-00000


In [102]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

RESULT_FILE=/user/hadoop/gutenberg-output/part-r-00000

# Peek at the beginning of the reducer output directly in HDFS.
hdfs dfs -head "$RESULT_FILE"

2021-01-06 23:46:27,474 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
"'Are	1
"'E's	1
"'I	1
"'Ittin'	1
"'Little	1
"'Lucy,	1
"'Maybe	1
"'Miss	1
"'My	2
"'Never	1
"'No'	1
"'Ow	1
"'Silence!	1
"'That's	1
"'Tyke	1
"'Wilhelmina'--I	1
"'Yes,	1
"A	8
"ABRAHAM	1
"ART."	1
"ARTHUR."	1
"About	1
"Afraid	1
"Again	1
"Agreed!"	1
"Ah	2
"Ah,	16
"Aha!	1
"Aha!"	2
"Alas!	1
"All	7
"Already?"	1
"Am	2
"Amen"	1
"An	1
"An'	1
"And	49
"And,	1
"Answer	1
"Any	1
"Arabian	1
"Are	5
"Arthur	1
"Arthur!	2
"As	6
"Ask	1
"At	3
"Ay,	1
"Back,	1
"Be	1
"Because	7
"Because,	1
"Because,"	2
"Before	1
"Believe	3
"Besides,	1
"Bloofer	1
"Blow	1
"Blue"	1
"Bother	1
"Brave	2
"Bring	1
"But	21
"But,	6
"But,"	4
"By	3
"Call	1
"Can	3
"Can't	1
"Certainly	2
"Certainly,"	1
"Certainly."	1
"Charcot	1
"Come	5
"Come!"	3
"Come,	6
"Come,"	4
"Come.	1
"Count	4
"DEMETER."	1
"DRACULA."	1
"Dear	5
"Defects,"	1
"Denn	1
"Destroyed?"	1
"Did	3
"Do	19
"Doctor,	1
"Don't	3
"Dr.	12
"Draw	1
"Edward	

In [103]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh
cd /tmp

# Copy the reducer output file from HDFS to the local filesystem.
# -f overwrites /tmp/gutenberg-output.txt if an earlier run left it behind;
# without it -get fails with "File exists" and head would show stale data.
hdfs dfs -get -f /user/hadoop/gutenberg-output/part-r-00000 gutenberg-output.txt

# Show the first lines of the local copy.
head /tmp/gutenberg-output.txt

2021-01-06 23:46:49,821 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


In [104]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

OUTPUT_DIR=/user/hadoop/gutenberg-output

# Delete the job output folder from HDFS, including everything inside it.
hdfs dfs -rm -r "$OUTPUT_DIR"

Deleted /user/hadoop/gutenberg-output


In [123]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh
cd /opt/hadoop/share/hadoop/mapreduce

# A MapReduce job refuses to start when its output directory already exists,
# so remove any result left by a previous run first
# (-r: recursive, -f: no error if the directory is absent).
hdfs dfs -rm -r -f /user/hadoop/gutenberg-output

# run wordcount application with 2 reducers
# (each reducer writes its own part-r-NNNNN file in the output directory)
hadoop jar ./hadoop-mapreduce-examples-3.2.1.jar wordcount \
-Dmapreduce.job.reduces=2 \
/user/hadoop/gutenberg /user/hadoop/gutenberg-output

2021-01-08 12:55:19,796 INFO client.RMProxy: Connecting to ResourceManager at hadoop/172.17.0.2:8032
2021-01-08 12:55:20,214 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.17.0.2:10200
2021-01-08 12:55:20,672 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1610022874613_0027
2021-01-08 12:55:20,992 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-08 12:55:21,746 INFO input.FileInputFormat: Total input files to process : 3
2021-01-08 12:55:21,830 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-08 12:55:21,914 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-08 12:55:21,957 INFO mapreduce.JobSubmitter: number of splits:3
2021-01-08 12:55:22,439 INFO sasl.SaslDataTransferCl

In [124]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

OUTPUT_DIR=/user/hadoop/gutenberg-output

# Show the files the job wrote to its HDFS output folder.
hdfs dfs -ls "$OUTPUT_DIR"

Found 3 items
-rw-r--r--   2 hadoop hadoop          0 2021-01-08 12:56 /user/hadoop/gutenberg-output/_SUCCESS
-rw-r--r--   2 hadoop hadoop     282621 2021-01-08 12:56 /user/hadoop/gutenberg-output/part-r-00000
-rw-r--r--   2 hadoop hadoop     283604 2021-01-08 12:56 /user/hadoop/gutenberg-output/part-r-00001


In [125]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh
cd /tmp

HDFS_OUTPUT=/user/hadoop/gutenberg-output
LOCAL_COPY=gutenberg-output.txt

# Concatenate the files of the HDFS output folder into a single local file.
hdfs dfs -getmerge "$HDFS_OUTPUT" "$LOCAL_COPY"

# Show the first lines of the merged local file.
head "/tmp/$LOCAL_COPY"

# Remove the output folder from HDFS now that a local copy exists.
hdfs dfs -rm -R "$HDFS_OUTPUT"

2021-01-08 12:57:29,087 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-08 12:57:29,309 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
"'Are	1
"'Little	1
"'Maybe	1
"'Miss	1
"'My	2
"'Never	1
"'No'	1
"'Ow	1
"'Silence!	1
"'Wilhelmina'--I	1


## Advanced Commands

- https://hadoop.apache.org/docs/r3.2.1/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html

### Verify HDFS cluster status

In [105]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# Print a 40-character "=" rule surrounded by blank lines.
separator() {
    printf "\n%40s\n\n" | tr " " "="
}

# Racks and the DataNodes attached to each of them.
hdfs dfsadmin -printTopology

separator

# Capacity/usage summary followed by per-DataNode details.
hdfs dfsadmin -report

Rack: /default-rack
   172.17.0.3:9866 (hadoop1)
   172.17.0.4:9866 (hadoop2)
   172.17.0.5:9866 (hadoop3)

Configured Capacity: 188176871424 (175.25 GB)
Present Capacity: 60270764530 (56.13 GB)
DFS Remaining: 60262526976 (56.12 GB)
DFS Used: 8237554 (7.86 MB)
DFS Used%: 0.01%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (3):

Name: 172.17.0.3:9866 (hadoop1)
Hostname: hadoop1
Decommission Status : Normal
Configured Capacity: 62725623808 (58.42 GB)
DFS Used: 1757678 (1.68 MB)
Non DFS Used: 39419641362 (36.71 GB)
DFS Remain

### Replication factor

In [107]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

GUTENBERG_DIR=/user/hadoop/gutenberg

# Print a 40-character "=" rule surrounded by blank lines.
separator() {
    printf "\n%40s\n\n" | tr " " "="
}

# List the files of the folder with their blocks and replica locations.
show_block_locations() {
    hdfs fsck "$GUTENBERG_DIR" -files -blocks -locations
}

# list folder block location (initial state)
show_block_locations

separator

# change replication factor of all files in directory to 3
# -w waits until the extra replicas actually exist; without it the command
# returns immediately and the fsck below may still show the old replica count.
hdfs dfs -setrep -w 3 "$GUTENBERG_DIR"

separator

# list folder block location (after the increase)
show_block_locations

separator

# change replication factor back to 2
# No -w here: waiting on a decrease can take a long time, so the excess
# replicas may still be listed by the fsck that follows.
hdfs dfs -setrep 2 "$GUTENBERG_DIR"

separator

# list folder block location (after the decrease)
show_block_locations

Connecting to namenode via http://hadoop:9870/fsck?ugi=hadoop&files=1&blocks=1&locations=1&path=%2Fuser%2Fhadoop%2Fgutenberg
FSCK started by hadoop (auth:SIMPLE) from /172.17.0.2 for path /user/hadoop/gutenberg at Wed Jan 06 23:50:47 GMT 2021
/user/hadoop/gutenberg <dir>
/user/hadoop/gutenberg/book1.txt 1276201 bytes, replicated: replication=3, 1 block(s):  OK
0. BP-361701204-172.17.0.2-1609934326274:blk_1073741862_1038 len=1276201 Live_repl=3  [DatanodeInfoWithStorage[172.17.0.5:9866,DS-5840172a-b11f-4b67-ba2a-dc19c050287b,DISK], DatanodeInfoWithStorage[172.17.0.4:9866,DS-48d422f2-7e3f-402f-9948-1a4d8a13840a,DISK], DatanodeInfoWithStorage[172.17.0.3:9866,DS-d88ef73a-59ee-4c59-bcf5-d23bc4c680af,DISK]]

/user/hadoop/gutenberg/book2.txt 799738 bytes, replicated: replication=3, 1 block(s):  OK
0. BP-361701204-172.17.0.2-1609934326274:blk_1073741863_1039 len=799738 Live_repl=3  [DatanodeInfoWithStorage[172.17.0.5:9866,DS-5840172a-b11f-4b67-ba2a-dc19c050287b,DISK], DatanodeInfoWithStorage[1

### Decommission nodes

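- Nodes to decommission are listed in the exclude file referenced by `dfs.hosts.exclude` in hdfs-site.xml (here, presumably `/opt/hadoop/etc/hadoop/dfs.exclude`)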

In [109]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

EXCLUDE_FILE=/opt/hadoop/etc/hadoop/dfs.exclude

# Decommission hadoop1: make it the only entry of the exclude file.
printf 'hadoop1\n' > "$EXCLUDE_FILE"

# Have the NameNode re-read its host lists so the change takes effect.
hdfs dfsadmin -refreshNodes

Refresh nodes successful


Check the decommission status of the node in the NameNode web UI: http://localhost:9870

In [110]:
%%dockerexec -u hadoop hadoop
. /opt/envvars.sh

# Cluster summary plus per-DataNode details; check the
# "Decommission Status" line of each node.
hdfs dfsadmin -report

Configured Capacity: 125451247616 (116.84 GB)
Present Capacity: 40175570893 (37.42 GB)
DFS Remaining: 40167333888 (37.41 GB)
DFS Used: 8237005 (7.86 MB)
DFS Used%: 0.02%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (3):

Name: 172.17.0.3:9866 (hadoop1)
Hostname: hadoop1
Decommission Status : Decommissioned
Configured Capacity: 62725623808 (58.42 GB)
DFS Used: 3849357 (3.67 MB)
Non DFS Used: 39421391731 (36.71 GB)
DFS Remaining: 20083666944 (18.70 GB)
DFS Used%: 0.01%
DFS Remaining%: 32.02%
Configured Cache Capacity: 0 (0 

In [111]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

EXCLUDE_FILE=/opt/hadoop/etc/hadoop/dfs.exclude

# Recommission all nodes: leave the exclude file empty.
: > "$EXCLUDE_FILE"

# Have the NameNode re-read its host lists so the change takes effect.
hdfs dfsadmin -refreshNodes

Refresh nodes successful


In [115]:
%%dockerexec -u hadoop hadoop
. /opt/envvars.sh

# Cluster summary plus per-DataNode details; every node should be back to
# "Decommission Status : Normal".
hdfs dfsadmin -report

Configured Capacity: 125451247616 (116.84 GB)
Present Capacity: 40182281386 (37.42 GB)
DFS Remaining: 40174968832 (37.42 GB)
DFS Used: 7312554 (6.97 MB)
DFS Used%: 0.02%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (2):

Name: 172.17.0.4:9866 (hadoop2)
Hostname: hadoop2
Decommission Status : Normal
Configured Capacity: 62725623808 (58.42 GB)
DFS Used: 3222266 (3.07 MB)
Non DFS Used: 39418201350 (36.71 GB)
DFS Remaining: 20087484416 (18.71 GB)
DFS Used%: 0.01%
DFS Remaining%: 32.02%
Configured Cache Capacity: 0 (0 B)
Cache

### Handling datanode failures

- Timeouts defined in hdfs-site.xml
    - dfs.namenode.heartbeat.recheck-interval = 10000 ms (10 seconds)
    - dfs.heartbeat.interval = 3 seconds
- A DataNode is declared dead after: timeout = 2 x recheck-interval + 10 x heartbeat.interval
    - timeout = 2 x 10 s + 10 x 3 s = 50 seconds

In [116]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# Print the effective value of the two settings that determine when a
# DataNode is considered dead, in this order:
#   1. dfs.namenode.heartbeat.recheck-interval
#   2. dfs.heartbeat.interval
for conf_key in dfs.namenode.heartbeat.recheck-interval dfs.heartbeat.interval; do
    hdfs getconf -confKey "$conf_key"
done

10000
3s


In [113]:
%%bash

FAULTY_NODE=hadoop1

# Simulate a node fault by freezing every process of the worker container.
docker pause "$FAULTY_NODE"

hadoop1


Watch the node being marked as dead in the NameNode web UI: http://localhost:9870

In [93]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# The NameNode only declares a DataNode dead once the heartbeat timeout has
# elapsed (about 50 s with this cluster's settings). Running the report right
# after pausing the container would still list hadoop1 as live, so poll the
# dead-node list first, giving up after roughly two minutes.
for attempt in $(seq 1 24); do
    if hdfs dfsadmin -report -dead 2>/dev/null | grep -q "hadoop1"; then
        break
    fi
    sleep 5
done

# Full report: hadoop1 should no longer be counted among the live datanodes.
hdfs dfsadmin -report

Configured Capacity: 125451247616 (116.84 GB)
Present Capacity: 40179048448 (37.42 GB)
DFS Remaining: 40171765760 (37.41 GB)
DFS Used: 7282688 (6.95 MB)
DFS Used%: 0.02%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (2):

Name: 172.17.0.4:9866 (hadoop2)
Hostname: hadoop2
Decommission Status : Normal
Configured Capacity: 62725623808 (58.42 GB)
DFS Used: 3641344 (3.47 MB)
Non DFS Used: 39419383808 (36.71 GB)
DFS Remaining: 20085882880 (18.71 GB)
DFS Used%: 0.01%
DFS Remaining%: 32.02%
Configured Cache Capacity: 0 (0 B)
Cache

In [117]:
%%bash

RECOVERED_NODE=hadoop1

# Recover the node by resuming the frozen worker container.
docker unpause "$RECOVERED_NODE"

hadoop1


In [118]:
%%dockerexec -u hadoop hadoop
. /opt/envvars.sh

# Cluster summary plus per-DataNode details; hadoop1 should be listed among
# the live datanodes again.
hdfs dfsadmin -report

Configured Capacity: 188176871424 (175.25 GB)
Present Capacity: 60270694349 (56.13 GB)
DFS Remaining: 60262416384 (56.12 GB)
DFS Used: 8277965 (7.89 MB)
DFS Used%: 0.01%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (3):

Name: 172.17.0.3:9866 (hadoop1)
Hostname: hadoop1
Decommission Status : Normal
Configured Capacity: 62725623808 (58.42 GB)
DFS Used: 40960 (40 KB)
Non DFS Used: 39421394944 (36.71 GB)
DFS Remaining: 20087472128 (18.71 GB)
DFS Used%: 0.00%
DFS Remaining%: 32.02%
Configured Cache Capacity: 0 (0 B)
Cache Use