# Flume
![Flume](https://flume.apache.org/_static/flume-logo.png)

- https://flume.apache.org

## Setup

- Download Flume from https://flume.apache.org/download.html
- Version used in this notebook: 1.9.0

In [1]:
%%bash

# Install Apache Flume 1.9.0 under /opt and register it in /opt/envvars.sh.
# The cell is safe to run more than once: the download resumes, the symlink
# is replaced in place, and the envvars stanza is appended only if missing.

# Fail fast: stop at the first failing step instead of continuing with a
# half-installed package.
set -euo pipefail

# Download package (-c resumes a partial download, -q keeps the output quiet).
# archive.apache.org is used because it keeps older releases such as 1.9.0;
# downloads.apache.org only carries the current ones.
cd /opt/pkgs
wget -q -c https://archive.apache.org/dist/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz

# unpack file and create link
# -f replaces an existing link; -n stops ln from following an existing
# /opt/flume link and creating a nested link inside the target directory.
tar -zxf /opt/pkgs/apache-flume-1.9.0-bin.tar.gz -C /opt
ln -sfn /opt/apache-flume-1.9.0-bin /opt/flume

# update guava library on Flume: swap the bundled guava 11 jar for the
# guava 27 jar shipped with the local Hadoop installation
rm -f /opt/flume/lib/guava-11.0.2.jar
cp -f /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/flume/lib

# update envvars.sh
# Append the Flume stanza only once. The quoted 'EOF' delimiter writes
# ${PATH} and ${FLUME_HOME} literally, without expanding them here.
if ! grep -qs '^export FLUME_HOME=' /opt/envvars.sh; then
cat >> /opt/envvars.sh << 'EOF'
# Flume
export FLUME_HOME=/opt/flume
export PATH=${PATH}:${FLUME_HOME}/bin

EOF
fi

cat /opt/envvars.sh

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
export PDSH_RCMD_TYPE=ssh

export HADOOP_HOME=/opt/hadoop
export HADOOP_COMMON_HOME=${HADOOP_HOME}
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export HADOOP_YARN_HOME=${HADOOP_HOME}

export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin     

# Flume
export FLUME_HOME=/opt/flume
export PATH=${PATH}:${FLUME_HOME}/bin



In [2]:
# Load environment variables
# Enable the dotenv IPython extension, which provides the %dotenv magic.
%load_ext dotenv
# Read /opt/envvars.sh into this kernel's environment.
# NOTE(review): -o presumably overrides variables that are already set -- confirm in the python-dotenv docs.
%dotenv -o /opt/envvars.sh
# Print the resulting environment so the loaded variables can be checked.
%env

{'HOSTNAME': 'hadoop',
 'OLDPWD': '/',
 'PWD': '/opt',
 'HOME': '/home/hadoop',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/flume/bin',
 '_': '/usr/bin/nohup',
 'LANGUAGE': 'en.UTF-8',
 'LANG': 'en.UTF-8',
 'JPY_PARENT_PID': '1566',
 'TERM': 'xterm-color',
 'CLICOLOR': '1',
 'PAGER': 'cat',
 'GIT_PAGER': 'cat',
 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline',
 'JAVA_HOME': '/usr/lib/jvm/java-1.8.0-openjdk-amd64',
 'PDSH_RCMD_TYPE': 'ssh',
 'HADOOP_HOME': '/opt/hadoop',
 'HADOOP_COMMON_HOME': '/opt/hadoop',
 'HADOOP_CONF_DIR': '/opt/hadoop/etc/hadoop',
 'HADOOP_HDFS_HOME': '/opt/hadoop',
 'HADOOP_MAPRED_HOME': '/opt/hadoop',
 'HADOOP_YARN_HOME': '/opt/hadoop',
 'FLUME_HOME': '/opt/flume'}

## Tailagent example

- Reference: Flume 1.9.0 User Guide — https://flume.apache.org/releases/content/1.9.0/FlumeUserGuide.html

In [3]:
%%writefile /opt/flume/conf/tailagent.conf
# Agent "tailagent": one source, one sink, one channel
tailagent.sources = execsource
tailagent.sinks = hdfssink
tailagent.channels = memchannel

# Source: exec type running the command below, attached to memchannel
tailagent.sources.execsource.type = exec
tailagent.sources.execsource.command = tail -F /tmp/events
tailagent.sources.execsource.channels = memchannel

# Channel: memory type
tailagent.channels.memchannel.type = memory

# Sink: hdfs type writing DataStream files under /tmp with the
# "tailevents-" prefix, reading from memchannel
tailagent.sinks.hdfssink.type = hdfs
tailagent.sinks.hdfssink.hdfs.path = /tmp
tailagent.sinks.hdfssink.hdfs.filePrefix = tailevents-
tailagent.sinks.hdfssink.hdfs.fileType = DataStream
tailagent.sinks.hdfssink.channel = memchannel

Writing /opt/flume/conf/tailagent.conf


In [4]:
%%bash

# run agent in background
# Abort if the Flume directory is missing so the log and PID files are never
# written into whatever directory the cell happened to start in.
cd /opt/flume || exit 1

# -n: agent name (the prefix used by the properties in tailagent.conf)
# -c: configuration directory, -f: agent configuration file
# stdout/stderr are redirected to a log file so the cell returns immediately.
flume-ng agent -n tailagent -c ./conf \
  -f ./conf/tailagent.conf > ./tailagent.output 2>&1 &

# Keep the PID of the background job; the PID file is used later to stop it.
agent_pid=$!
printf '%s\n' "${agent_pid}" > ./tailagent.pid

# Show the process that was just launched.
ps -fp "${agent_pid}"

UID        PID  PPID  C STIME TTY          TIME CMD
hadoop    6284  6283  0 18:42 ?        00:00:00 /bin/bash /opt/flume/bin/flume-ng agent -n tailagent -c ./conf -f ./conf/tailagent.conf


In [5]:
%%bash

# run random generator in background
# Abort if the Flume directory is missing so the script and PID file are not
# created in an unexpected location.
cd /opt/flume || exit 1

# Write the generator script. The quoted 'EOF' delimiter keeps ${RANDOM}
# literal in the file, and the shebang pins bash as the interpreter
# (${RANDOM} is a bash feature, not POSIX sh).
cat > randomgen.sh << 'EOF'
#!/bin/bash
# Append one random number per second to /tmp/events.
while true
do
    echo ${RANDOM} >> /tmp/events
    sleep 1
done
EOF

chmod +x randomgen.sh
./randomgen.sh > /dev/null 2>&1 &

# Keep the PID of the background job; the PID file is used later to stop it.
generator_pid=$!
printf '%s\n' "${generator_pid}" > ./randomgen.pid

ps -fp "${generator_pid}"

UID        PID  PPID  C STIME TTY          TIME CMD
hadoop    6402  6399  0 18:43 ?        00:00:00 bash


In [6]:
%%bash

# check files generated in HDFS
# The pattern is quoted so the local shell passes it through untouched and
# HDFS expands it; unquoted, bash would first try to match it against the
# local /tmp directory.
hdfs dfs -ls '/tmp/tailevents*'

-rw-r--r--   2 hadoop supergroup         58 2021-01-28 18:43 /tmp/tailevents-.1611859419661
-rw-r--r--   2 hadoop supergroup         58 2021-01-28 18:43 /tmp/tailevents-.1611859419662
-rw-r--r--   2 hadoop supergroup         60 2021-01-28 18:44 /tmp/tailevents-.1611859419663
-rw-r--r--   2 hadoop supergroup          0 2021-01-28 18:44 /tmp/tailevents-.1611859419664.tmp


In [7]:
%%bash

# cat files
# The pattern is quoted so the local shell passes it through untouched and
# HDFS expands it; unquoted, bash would first try to match it against the
# local /tmp directory.
hdfs dfs -cat '/tmp/tailevents*'

23394
5142
16495
25499
23988
26597
1549
16255
21528
11905
15779
11346
17910
15780
5660
6556
29977
17824
15921
14510
18007
31382
16445
32398
17829
29805
19372
14028
18519
32473
4764
16680
14764
26848
28276
738
30500
26792
30753
21743
16188
2908
905
20420
25857
21791
6036
4479
12589
12286
1138
27667
29653
17335
21126
5907
29906
17289
27682
22521
13668
22511
13277
10957
2245
27363
25891
25581
30651
9290


2021-01-28 18:44:46,857 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-28 18:44:47,378 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-28 18:44:47,583 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


In [8]:
%%bash

# Stop the demo processes and remove the files they produced.
# Abort if the Flume directory is missing so the rm commands below never run
# in an unexpected directory.
cd /opt/flume || exit 1

# Send SIGTERM to the process recorded in a PID file, then remove the file.
# Arguments: $1 - path to the PID file
# A missing PID file or an already-dead process is reported on stderr
# instead of failing with a confusing kill/cat error.
stop_by_pidfile() {
  local pidfile=$1
  if [[ -f "${pidfile}" ]]; then
    kill "$(<"${pidfile}")" 2>/dev/null \
      || echo "process from ${pidfile} is not running" >&2
    rm -f -- "${pidfile}"
  else
    echo "${pidfile} not found; nothing to stop" >&2
  fi
}

# kill random generator
stop_by_pidfile randomgen.pid

# kill tailagent
stop_by_pidfile tailagent.pid
rm -f -- tailagent.output

# remove files
# The pattern is quoted so HDFS expands it, not the local shell.
hdfs dfs -rm '/tmp/tailevents*'

Deleted /tmp/tailevents-.1611859419661
Deleted /tmp/tailevents-.1611859419662
Deleted /tmp/tailevents-.1611859419663
Deleted /tmp/tailevents-.1611859419664
Deleted /tmp/tailevents-.1611859419665
Deleted /tmp/tailevents-.1611859419666
Deleted /tmp/tailevents-.1611859419667
Deleted /tmp/tailevents-.1611859419668
Deleted /tmp/tailevents-.1611859419669
Deleted /tmp/tailevents-.1611859419670
Deleted /tmp/tailevents-.1611859419671
Deleted /tmp/tailevents-.1611859419672
