In [1]:
# Load environment variables
%load_ext dotenv
%dotenv -o /opt/envvars.sh
%env

{'HOSTNAME': 'hadoop',
 'OLDPWD': '/',
 'PWD': '/opt',
 'HOME': '/home/hadoop',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/flume/bin:/opt/sqoop/bin',
 '_': '/usr/bin/nohup',
 'LANGUAGE': 'en.UTF-8',
 'LANG': 'en.UTF-8',
 'JPY_PARENT_PID': '1566',
 'TERM': 'xterm-color',
 'CLICOLOR': '1',
 'PAGER': 'cat',
 'GIT_PAGER': 'cat',
 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline',
 'JAVA_HOME': '/usr/lib/jvm/java-1.8.0-openjdk-amd64',
 'PDSH_RCMD_TYPE': 'ssh',
 'HADOOP_HOME': '/opt/hadoop',
 'HADOOP_COMMON_HOME': '/opt/hadoop',
 'HADOOP_CONF_DIR': '/opt/hadoop/etc/hadoop',
 'HADOOP_HDFS_HOME': '/opt/hadoop',
 'HADOOP_MAPRED_HOME': '/opt/hadoop',
 'HADOOP_YARN_HOME': '/opt/hadoop',
 'FLUME_HOME': '/opt/flume',
 'SQOOP_HOME': '/opt/sqoop'}

# MapReduce

## Java Map Reduce API - WordCount

In [2]:
%mkdir -p /opt/src/wc_java
%cd /opt/src/wc_java

/opt/src/wc_java


In [3]:
%%writefile WordCountDriver.java
// WordCountDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountDriver extends Configured implements Tool {
  public int run(String[] args) throws Exception {
   if (args.length != 2) {
    System.out.printf("Usage: %s [generic options] <inputdir> <outputdir>\n", getClass().getSimpleName());
    return -1;
   }

   Job job = Job.getInstance(getConf(), "Word Count");
   job.setJarByClass(WordCountDriver.class);
   FileInputFormat.setInputPaths(job, new Path(args[0]));
   FileOutputFormat.setOutputPath(job, new Path(args[1]));
   job.setMapperClass(WordCountMapper.class);
   job.setReducerClass(WordCountReducer.class);
   job.setMapOutputKeyClass(Text.class);
   job.setMapOutputValueClass(IntWritable.class);
   job.setOutputKeyClass(Text.class);
   job.setOutputValueClass(IntWritable.class);
   boolean success = job.waitForCompletion(true);
   return success ? 0 : 1;
 }

 public static void main(String[] args) throws Exception {
   int exitCode = ToolRunner.run(new Configuration(), new WordCountDriver(), args);
   System.exit(exitCode);
 }
}

Writing WordCountDriver.java


In [4]:
%%writefile WordCountMapper.java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
  private final static IntWritable one = new IntWritable(1);
  private Text wordObject = new Text();

  @Override
  public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {

   String line = value.toString();
   for (String word : line.split("\\W+")) {
    if (word.length() > 0) {
     wordObject.set(word);
     context.write(wordObject, one);
    }
   }
  }
}

Writing WordCountMapper.java


In [5]:
%%writefile WordCountReducer.java
// WordCountReducer.java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
  private IntWritable wordCountWritable = new IntWritable();
  @Override
   public void reduce(Text key, Iterable<IntWritable> values, Context context)
             throws IOException, InterruptedException {
         int wordCount = 0;
         for (IntWritable value : values) {
             wordCount += value.get();
         }
         wordCountWritable.set(wordCount);
         context.write(key, wordCountWritable);
    }
}

Writing WordCountReducer.java


In [6]:
%%bash

hadoop classpath

/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/lib/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*


In [7]:
%%bash

# compile source code and create jar file
javac -classpath `hadoop classpath` *.java
jar cvf wc.jar *.class
ls

added manifest
adding: WordCountDriver.class(in = 2277) (out= 1104)(deflated 51%)
adding: WordCountMapper.class(in = 1921) (out= 813)(deflated 57%)
adding: WordCountReducer.class(in = 1711) (out= 719)(deflated 57%)
WordCountDriver.class
WordCountDriver.java
WordCountMapper.class
WordCountMapper.java
WordCountReducer.class
WordCountReducer.java
wc.jar


In [8]:
%%bash

cd /opt/datasets
# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt
    
# create directory in HDFS and put file
hdfs dfs -mkdir shakespeare
hdfs dfs -put shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

Found 1 items
-rw-r--r--   2 hadoop hadoop      5.5 M 2021-01-29 12:53 shakespeare/shakespeare.txt


2021-01-29 12:53:10,798 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


In [9]:
%%bash

# run wordcount using 2 reducers
hadoop jar wc.jar WordCountDriver -D mapreduce.job.reduces=2 shakespeare shakespeare-output

2021-01-29 12:54:45,437 INFO client.RMProxy: Connecting to ResourceManager at hadoop/172.17.0.2:8032
2021-01-29 12:54:46,441 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.17.0.2:10200
2021-01-29 12:54:47,241 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1611844877680_0007
2021-01-29 12:54:47,568 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-29 12:54:48,499 INFO input.FileInputFormat: Total input files to process : 1
2021-01-29 12:54:48,631 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-29 12:54:48,856 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-29 12:54:48,903 INFO mapreduce.JobSubmitter: number of splits:1
2021-01-29 12:54:49,429 INFO sasl.SaslDataTransferCl

In [10]:
%%bash

cd /opt/datasets

# check output files
hdfs dfs -ls shakespeare-output

# get output from HDFS
hdfs dfs -getmerge shakespeare-output shakespeare-output.txt

# head shakespeare-output.txt
head shakespeare-output.txt

Found 3 items
-rw-r--r--   2 hadoop hadoop          0 2021-01-29 12:56 shakespeare-output/_SUCCESS
-rw-r--r--   2 hadoop hadoop     168219 2021-01-29 12:56 shakespeare-output/part-r-00000
-rw-r--r--   2 hadoop hadoop     166517 2021-01-29 12:56 shakespeare-output/part-r-00001
1	117
10	3
100	6
1000	1
1004	1
102	1
1020	1
1024	1
1028	1
1033	1


2021-01-29 12:58:05,073 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


## Hadoop Streaming

In [11]:
%mkdir /opt/src/wc_streaming
%cd /opt/src/wc_streaming

/opt/src/wc_streaming


In [12]:
%%writefile wordmapper.py
#!/usr/bin/env python3
# wordmapper.py

import sys
for line in sys.stdin:
  line = line.strip()
  words = line.split()
  for word in words:
   print('%s\t%s' % (word, 1))

Writing wordmapper.py


In [13]:
%%writefile wordreducer.py
#!/usr/bin/env python3
# wordreducer.py

import sys

thisword = None
wordcount = 0
word = None

for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    count = int(count)

    if thisword == word:
        wordcount += count
    else:
        if thisword:
            print('%s\t%d' % (thisword, wordcount))
        wordcount = count
        thisword = word

if thisword == word:
    print('%s\t%d' % (thisword, wordcount))

Writing wordreducer.py


In [14]:
%%bash

cd /opt/datasets

# download book "Ulysses, by James Joyce" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/4300/4300-0.txt -O ulysses.txt
    
# create directory in HDFS and put file
hdfs dfs -mkdir ulysses
hdfs dfs -put ulysses.txt ulysses
hdfs dfs -ls -h ulysses

Found 1 items
-rw-r--r--   2 hadoop hadoop      1.5 M 2021-01-29 13:00 ulysses/ulysses.txt


2021-01-29 13:00:51,923 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


In [15]:
%%bash

# execute using Hadoop Streaming
hadoop jar \
$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.2.1.jar \
-input ulysses \
-output ulysses-output \
-mapper wordmapper.py \
-reducer wordreducer.py \
-file wordmapper.py \
-file wordreducer.py

packageJobJar: [wordmapper.py, wordreducer.py, /tmp/hadoop-unjar7214351184219525290/] [] /tmp/streamjob1648118698111977638.jar tmpDir=null


2021-01-29 13:01:17,809 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.
2021-01-29 13:01:21,148 INFO client.RMProxy: Connecting to ResourceManager at hadoop/172.17.0.2:8032
2021-01-29 13:01:22,242 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.17.0.2:10200
2021-01-29 13:01:22,325 INFO client.RMProxy: Connecting to ResourceManager at hadoop/172.17.0.2:8032
2021-01-29 13:01:22,328 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.17.0.2:10200
2021-01-29 13:01:23,322 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1611844877680_0008
2021-01-29 13:01:24,370 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-29 13:01:24,885 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
2021-01-29 13

In [16]:
%%bash

cd /opt/datasets

# check output files
hdfs dfs -ls ulysses-output

# get output from HDFS
hdfs dfs -getmerge ulysses-output ulysses-output.txt

# head output
head ulysses-output.txt

Found 2 items
-rw-r--r--   2 hadoop hadoop          0 2021-01-29 13:02 ulysses-output/_SUCCESS
-rw-r--r--   2 hadoop hadoop     530683 2021-01-29 13:02 ulysses-output/part-00000
"Defects,"	1
"Information	1
"Plain	2
"Project	5
"Right	1
#4300]	1
$5,000)	1
%	4
&c,	2
&c.	1


2021-01-29 13:02:51,299 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
