In [2]:
%load_ext dockermagic

# Flume
![Flume](https://flume.apache.org/_static/flume-logo.png)

- https://flume.apache.org

## Setup

- Download the binary package from https://flume.apache.org/download.html
- This notebook uses version 1.9.0

In [3]:
%%bash

# Stop at the first failing command so a failed download is not followed by
# a confusing "no such file" error from docker cp.
set -euo pipefail

FLUME_VERSION=1.9.0
FLUME_TARBALL="apache-flume-${FLUME_VERSION}-bin.tar.gz"

# Download package (-c resumes a partial download, -q keeps the cell output quiet).
# Superseded releases are served from archive.apache.org; downloads.apache.org
# only carries the current release.
wget -q -c "https://archive.apache.org/dist/flume/${FLUME_VERSION}/${FLUME_TARBALL}"

# Copy installation package to container
docker cp "${FLUME_TARBALL}" hadoop:/opt

In [4]:
%%dockerexec -u hadoop hadoop

# unpack file and create link
tar -zxf /opt/apache-flume-1.9.0-bin.tar.gz -C /opt
# -f -n: replace an existing /opt/flume link instead of creating a nested
# link inside the directory it points to when this cell is run again
ln -sfn /opt/apache-flume-1.9.0-bin /opt/flume

# update guava library on Flume: drop the bundled guava 11.0.2 jar and use the
# guava 27.0-jre jar shipped with Hadoop
# NOTE(review): presumably needed to avoid a guava version clash with the Hadoop client classes - confirm
rm -f /opt/flume/lib/guava-11.0.2.jar
cp -f /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/flume/lib

# update envvars.sh (skipped when the Flume section is already present, so
# re-running the cell does not append duplicates)
if ! grep -q 'FLUME_HOME=' /opt/envvars.sh 2>/dev/null; then
cat >> /opt/envvars.sh << EOF
# Flume
export FLUME_HOME=/opt/flume
export PATH=\$PATH:\$FLUME_HOME/bin

EOF
fi

# remove the installation package copied into the container
sudo rm /opt/apache-flume-1.9.0-bin.tar.gz

## Tailagent example

- https://flume.apache.org/releases/content/1.9.0/FlumeUserGuide.html

In [15]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# Write the agent definition into Flume's conf directory: an exec source
# running `tail -F /tmp/events`, a memory channel, and an HDFS sink that
# writes files prefixed "tailevents-" under /tmp.
conf_file=$FLUME_HOME/conf/tailagent.conf
cat << EOF > "$conf_file"
# Agent components
tailagent.sources = execsource
tailagent.channels = memchannel
tailagent.sinks = hdfssink

# Configuring source
tailagent.sources.execsource.type = exec
tailagent.sources.execsource.command = tail -F /tmp/events

# Configuring sink
tailagent.sinks.hdfssink.type = hdfs
tailagent.sinks.hdfssink.hdfs.path = /tmp
tailagent.sinks.hdfssink.hdfs.filePrefix = tailevents-
tailagent.sinks.hdfssink.hdfs.fileType = DataStream

# Configuring channel
tailagent.channels.memchannel.type = memory

# Bind the source and sink to the channel 
tailagent.sources.execsource.channels = memchannel
tailagent.sinks.hdfssink.channel = memchannel
EOF

In [16]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# Start the agent named "tailagent" in the background from /opt/flume;
# stdout and stderr are both collected in ./tailagent.output.
cd /opt/flume
nohup flume-ng agent -n tailagent -c ./conf -f ./conf/tailagent.conf \
    > ./tailagent.output 2>&1 &

# Remember the background PID so a later cell can stop the agent.
agent_pid=$!
echo "$agent_pid" > ./tailagent.pid

# Show the process that was just started.
ps -fp "$agent_pid"

UID        PID  PPID  C STIME TTY          TIME CMD
hadoop   10516 10510  0 22:19 ?        00:00:00 /bin/bash /opt/flume/bin/flume-ng agent -n tailagent -c ./conf -f ./conf/tailagent.conf


In [17]:
%%dockerexec -u hadoop hadoop

cd /opt/flume

# Background loop: append one random number per second to /tmp/events,
# the file the agent's exec source tails.
while :; do
    printf '%s\n' "$RANDOM" >> /tmp/events
    sleep 1
done &

# Save the loop's PID for the cleanup cell, then detach the job from this shell.
printf '%s\n' "$!" > ./randomgen.pid
disown %1

In [20]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# check files generated in HDFS
# The pattern is quoted so the local shell passes it through unchanged and the
# HDFS client expands it; unquoted, a matching local /tmp/tailevents* path
# would be substituted before hdfs ever sees the wildcard.
hdfs dfs -ls '/tmp/tailevents*'

-rw-r--r--   2 hadoop supergroup         58 2021-01-10 22:20 /tmp/tailevents-.1610317196676
-rw-r--r--   2 hadoop supergroup         56 2021-01-10 22:20 /tmp/tailevents-.1610317196677
-rw-r--r--   2 hadoop supergroup         56 2021-01-10 22:20 /tmp/tailevents-.1610317196678
-rw-r--r--   2 hadoop supergroup         58 2021-01-10 22:20 /tmp/tailevents-.1610317196679
-rw-r--r--   2 hadoop supergroup         56 2021-01-10 22:20 /tmp/tailevents-.1610317196680
-rw-r--r--   2 hadoop supergroup         56 2021-01-10 22:20 /tmp/tailevents-.1610317196681
-rw-r--r--   2 hadoop supergroup          0 2021-01-10 22:20 /tmp/tailevents-.1610317196682.tmp


In [19]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh

# cat files
# The pattern is quoted so the local shell passes it through unchanged and the
# HDFS client expands it; unquoted, a matching local /tmp/tailevents* path
# would be substituted before hdfs ever sees the wildcard.
hdfs dfs -cat '/tmp/tailevents*'

2021-01-10 22:20:46,543 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
32719
10482
110
24784
16871
27600
27711
28110
14724
20485
2021-01-10 22:20:46,871 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
3753
7063
23118
14781
19959
13979
11058
1944
18707
4090
9301
24316
5406
28845
8926
7828
17084
32219
22451
18105
18468
27422
14026
5063
11698
10843
16120
13702
5822
14653
2891
6160
29513
24807
584
20909
24327
32624
13137
15788


In [21]:
%%dockerexec -u hadoop hadoop
source /opt/envvars.sh
cd /opt/flume

# kill random generator
kill "$(cat randomgen.pid)"
rm randomgen.pid

# kill tailagent, then wait (at most ~10 s) for the recorded PID to disappear
# before deleting its output
# NOTE(review): presumably the agent finalises its open HDFS .tmp file while shutting down - confirm
agent_pid=$(cat tailagent.pid)
kill "$agent_pid"
for _ in 1 2 3 4 5 6 7 8 9 10; do
    kill -0 "$agent_pid" 2>/dev/null || break
    sleep 1
done
rm tailagent.pid
rm tailagent.output

# remove files
# Local event file written by the random generator; removing it keeps a later
# run of the agent from tailing lines left over from this run.
rm -f /tmp/events
# Quoted so the wildcard is expanded by HDFS, not by the local shell.
hdfs dfs -rm '/tmp/tailevents*'

Deleted /tmp/tailevents-.1610317196676
Deleted /tmp/tailevents-.1610317196677
Deleted /tmp/tailevents-.1610317196678
Deleted /tmp/tailevents-.1610317196679
Deleted /tmp/tailevents-.1610317196680
Deleted /tmp/tailevents-.1610317196681
Deleted /tmp/tailevents-.1610317196682
Deleted /tmp/tailevents-.1610317196683.tmp
