In [1]:
%load_ext dockermagic

# Flume
![Flume](https://flume.apache.org/_static/flume-logo.png)

- https://flume.apache.org

## Setup

- Download the binary package from https://flume.apache.org/download.html
- Version used in this notebook: 1.11.0

In [4]:
%%dockerexec hadoop

# Install Apache Flume 1.11.0 under /opt and register it in /opt/envvars.sh.
# Each step is written so that the cell can be executed again without
# failing on existing paths or appending duplicate settings.

# Download package
# (-p: no error when the directory already exists; wget -c continues a
# partially downloaded file instead of starting over)
mkdir -p /opt/pkgs
cd /opt/pkgs
wget -q -c https://downloads.apache.org/flume/1.11.0/apache-flume-1.11.0-bin.tar.gz

# unpack file and create link
# (-f replaces an existing link; -n treats an existing /opt/flume link as a
# plain file, so the new link is not created inside the directory it points to)
tar -zxf /opt/pkgs/apache-flume-1.11.0-bin.tar.gz -C /opt
ln -sfn /opt/apache-flume-1.11.0-bin /opt/flume

# update envvars.sh, but only if the Flume settings are not there yet
# The delimiter is quoted so ${PATH} and ${FLUME_HOME} are written literally
# and get expanded later, when envvars.sh is sourced.
if ! grep -q 'FLUME_HOME=' /opt/envvars.sh 2>/dev/null; then
cat >> /opt/envvars.sh << 'EOF'
# Flume
export FLUME_HOME=/opt/flume
export PATH=${PATH}:${FLUME_HOME}/bin

EOF
fi

cat /opt/envvars.sh

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd4
export PDSH_RCMD_TYPE=ssh
export HADOOP_HOME=/opt/hadoop
export HADOOP_VERSION=3.3.6
export HADOOP_COMMON_HOME=${HADOOP_HOME}
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export HADOOP_YARN_HOME=${HADOOP_HOME}
export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin

# Flume
export FLUME_HOME=/opt/flume
export PATH=${PATH}:${FLUME_HOME}/bin



## Tailagent example

- https://flume.apache.org/releases/content/1.11.0/FlumeUserGuide.html

In [5]:
%%dockerexec hadoop

# Write the definition of the Flume agent "tailagent": an exec source that
# runs `tail -F /tmp/events`, a memory channel, and an HDFS sink that writes
# DataStream files prefixed "tailevents-" under /tmp.
# The heredoc delimiter is quoted, so the text below is written verbatim.
agent_conf=/opt/flume/conf/tailagent.conf

cat > "${agent_conf}" << 'EOF'
# Agent components
tailagent.sources = execsource
tailagent.channels = memchannel
tailagent.sinks = hdfssink

# Configuring source
tailagent.sources.execsource.type = exec
tailagent.sources.execsource.command = tail -F /tmp/events

# Configuring sink
tailagent.sinks.hdfssink.type = hdfs
tailagent.sinks.hdfssink.hdfs.path = /tmp
tailagent.sinks.hdfssink.hdfs.filePrefix = tailevents-
tailagent.sinks.hdfssink.hdfs.fileType = DataStream

# Configuring channel
tailagent.channels.memchannel.type = memory

# Bind the source and sink to the channel 
tailagent.sources.execsource.channels = memchannel
tailagent.sinks.hdfssink.channel = memchannel
EOF

In [6]:
%%dockerexec hadoop

# Load the environment (FLUME_HOME and PATH) so flume-ng can be found
. /opt/envvars.sh

# Work from the Flume home directory; the paths below are relative to it
cd /opt/flume

# Start the agent named "tailagent" as a background job, sending both
# stdout and stderr to ./tailagent.output
flume-ng agent --name tailagent --conf ./conf --conf-file ./conf/tailagent.conf \
    > ./tailagent.output 2>&1 &

# Store the PID of the background job so that a later cell can stop it
agent_pid=$!
echo "${agent_pid}" > ./tailagent.pid

# Show the process table entry of the agent
ps -fp "${agent_pid}"

UID          PID    PPID  C STIME TTY          TIME CMD


In [8]:
%%dockerexec hadoop

# run random generator in background
cd /opt/flume

# Create the generator script: it appends one random number per second to
# /tmp/events (the file read by the tailagent exec source).
# - The delimiter is quoted so ${RANDOM} is written literally into the script
#   instead of being expanded while the script is created.
# - The shebang pins bash: RANDOM is a bash feature, and without an
#   interpreter line the script would run under whatever shell the caller
#   falls back to.
cat > randomgen.sh << 'EOF'
#!/bin/bash
while true
do
    echo ${RANDOM} >> /tmp/events
    sleep 1
done
EOF

chmod +x randomgen.sh

# Start the generator as a background job and store its PID so that a later
# cell can stop it
./randomgen.sh > /dev/null 2>&1 &
echo $! > ./randomgen.pid

ps -fp $(cat ./randomgen.pid)

UID          PID    PPID  C STIME TTY          TIME CMD
hadoop      2550    2542  0 10:38 ?        00:00:00 bash


In [9]:
%%dockerexec hadoop

source /opt/envvars.sh

# check files generated in HDFS
# The pattern is quoted so the local shell hands it to hdfs unchanged and
# HDFS does the matching. Unquoted, a file in the local /tmp that matches
# the pattern would be substituted before hdfs runs.
hdfs dfs -ls '/tmp/tailevents*'

-rw-r--r--   2 hadoop supergroup         54 2023-12-05 10:38 /tmp/tailevents-.1701783503166
-rw-r--r--   2 hadoop supergroup         54 2023-12-05 10:38 /tmp/tailevents-.1701783503167
-rw-r--r--   2 hadoop supergroup          0 2023-12-05 10:38 /tmp/tailevents-.1701783503168.tmp


In [11]:
%%dockerexec hadoop

source /opt/envvars.sh

# cat files
# The pattern is quoted so the local shell hands it to hdfs unchanged and
# HDFS does the matching. Unquoted, a file in the local /tmp that matches
# the pattern would be substituted before hdfs runs.
hdfs dfs -cat '/tmp/tailevents*'

22820
1730
19818
9288
157
29896
2737
31966
27024
8438
57
13912
22149
21682
4636
32751
9308
7610
10160
10424
27527
8378
4966
15820
23815
3454
30339
6711
20963
10475
24744
16173
25017
26700
5771
8953
10394
20998
16547
9895


In [12]:
%%dockerexec hadoop

# Stop the background jobs started by this notebook and delete the files the
# agent wrote to HDFS. The cell can be run again after a successful cleanup
# without raising errors for PID files that are already gone.

source /opt/envvars.sh

cd /opt/flume

# kill random generator first (so no new events arrive), then tailagent;
# a job is only touched when its PID file exists
for job in randomgen tailagent
do
    if [ -f "${job}.pid" ]; then
        kill "$(cat "${job}.pid")"
        rm "${job}.pid"
    fi
done

# remove the agent's captured output (-f: no error if it is already gone)
rm -f tailagent.output

# remove files
# The pattern is quoted so the local shell hands it to hdfs unchanged and
# HDFS does the matching. Unquoted, a file in the local /tmp that matches
# the pattern would be substituted before hdfs runs.
hdfs dfs -rm '/tmp/tailevents*'

Deleted /tmp/tailevents-.1701783503166
Deleted /tmp/tailevents-.1701783503167
Deleted /tmp/tailevents-.1701783503168
Deleted /tmp/tailevents-.1701783503169
Deleted /tmp/tailevents-.1701783503170
Deleted /tmp/tailevents-.1701783503171
Deleted /tmp/tailevents-.1701783503172.tmp
