In [1]:
%load_ext dockermagic

# MapReduce

## Java MapReduce API - WordCount

In [2]:
%%dockerexec hadoop

mkdir -p /opt/src/wc_java

# The quoted delimiter ('EOF') makes the shell copy the Java source verbatim:
# no parameter expansion, command substitution or backslash processing.
cat <<'EOF' > /opt/src/wc_java/WordCountDriver.java
// WordCountDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Command-line entry point that configures and submits the WordCount job.
 *
 * Implements Tool so that ToolRunner parses the generic Hadoop options
 * (for example "-D mapreduce.job.reduces=2") before run() sees the arguments.
 */
public class WordCountDriver extends Configured implements Tool {

  /**
   * Builds the job and blocks until it finishes.
   *
   * @param args exactly two entries: the input directory and the output directory
   * @return 0 if the job succeeded, 1 if it failed, -1 on wrong usage
   */
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      // Usage errors belong on stderr so they are not mistaken for job output.
      System.err.printf("Usage: %s [generic options] <inputdir> <outputdir>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    Job job = Job.getInstance(getConf(), "Word Count");
    job.setJarByClass(WordCountDriver.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(WordCountMapper.class);
    // Summing is associative and commutative and the reducer consumes and
    // produces (Text, IntWritable), so it can also run as a combiner on the
    // map side and shrink the data sent through the shuffle.
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }

  /** Runs the driver through ToolRunner and exits with the code returned by run(). */
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new WordCountDriver(), args);
    System.exit(exitCode);
  }
}
EOF

In [9]:
%%dockerexec hadoop

# The quoted delimiter ('EOF') stops the shell from processing backslashes, so
# the Java source below is written exactly as shown and the regular expression
# can be spelled the normal Java way ("\\W+").
cat <<'EOF' > /opt/src/wc_java/WordCountMapper.java
// WordCountMapper.java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Emits a (word, 1) pair for every word found in an input line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>
{
  // Both writables are reused across calls instead of allocating one per word.
  private final static IntWritable one = new IntWritable(1);
  private final Text wordObject = new Text();

  /**
   * @param key     input key supplied by the framework (not used here)
   * @param value   one line of input text
   * @param context receives the (word, 1) pairs
   */
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {

    String line = value.toString();
    // Split on runs of non-word characters; a line that starts with such a
    // character yields a leading empty token, which is skipped below.
    for (String word : line.split("\\W+")) {
      if (word.length() > 0) {
        wordObject.set(word);
        context.write(wordObject, one);
      }
    }
  }
}
EOF

In [4]:
%%dockerexec hadoop

cat <<'EOF' > /opt/src/wc_java/WordCountReducer.java
// WordCountReducer.java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Adds up all counts received for a word and writes one (word, total) pair.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
  // Output value object, reused for every key.
  private IntWritable total = new IntWritable();

  /**
   * @param key     the word
   * @param values  the counts emitted for that word
   * @param context receives the (word, total) pair
   */
  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable partial : values) {
      sum += partial.get();
    }

    total.set(sum);
    context.write(key, total);
  }
}
EOF

In [5]:
%%dockerexec hadoop

# Load the container's environment settings into this shell
# (presumably what puts the hadoop/hdfs commands on PATH - confirm in /opt/envvars.sh).
source /opt/envvars.sh

# Print the Hadoop classpath; the compile cell passes this same value to javac.
hadoop classpath

/opt/hadoop/etc/hadoop:/opt/hadoop/share/hadoop/common/lib/*:/opt/hadoop/share/hadoop/common/*:/opt/hadoop/share/hadoop/hdfs:/opt/hadoop/share/hadoop/hdfs/lib/*:/opt/hadoop/share/hadoop/hdfs/*:/opt/hadoop/share/hadoop/mapreduce/*:/opt/hadoop/share/hadoop/yarn:/opt/hadoop/share/hadoop/yarn/lib/*:/opt/hadoop/share/hadoop/yarn/*


In [10]:
%%dockerexec hadoop

source /opt/envvars.sh

# Stop here if the source directory is missing instead of compiling and
# packaging whatever happens to be in the current directory.
cd /opt/src/wc_java || exit 1

# compile source code and create jar file
# - The command substitution is quoted: the classpath contains '*' entries that
#   must reach javac literally rather than being glob-expanded or split by the shell.
# - '&&' skips packaging when compilation fails, so stale .class files from an
#   earlier run are never put into a fresh jar.
javac -classpath "$(hadoop classpath)" *.java &&
jar cvf wc.jar *.class
ls

added manifest
adding: WordCountDriver.class(in = 2277) (out= 1104)(deflated 51%)
adding: WordCountMapper.class(in = 1921) (out= 813)(deflated 57%)
adding: WordCountReducer.class(in = 1711) (out= 719)(deflated 57%)
WordCountDriver.class
WordCountDriver.java
WordCountMapper.class
WordCountMapper.java
WordCountReducer.class
WordCountReducer.java
wc.jar


In [11]:
%%dockerexec hadoop

source /opt/envvars.sh

# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# wget runs quietly (-q), so make a failed download visible instead of
# uploading an empty file to HDFS.
if [ ! -s shakespeare.txt ]; then
  echo "download of shakespeare.txt failed" >&2
  exit 1
fi

# create directory in HDFS and put file
# -p: no error when the directory already exists (missing parents are created)
# -f: overwrite a copy left by a previous run
# Together they make this cell safe to run more than once.
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put -f shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

/opt
Found 1 items
-rw-r--r--   2 hadoop hadoop      5.4 M 2023-12-15 15:41 shakespeare/shakespeare.txt


In [12]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/src/wc_java || exit 1

# A MapReduce job refuses to start when its output directory already exists,
# so remove the result of any previous run first
# (-r: recursive, -f: no error when the directory is absent).
hdfs dfs -rm -r -f shakespeare-output

# run wordcount using 2 reducers
# (-D is a generic option handled by ToolRunner; it must come before the job's own arguments)
hadoop jar wc.jar WordCountDriver -D mapreduce.job.reduces=2 shakespeare shakespeare-output

2023-12-15 15:42:30,805 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at hadoop/172.18.0.5:8032
2023-12-15 15:42:30,915 INFO client.AHSProxy: Connecting to Application History server at hadoop/172.18.0.5:10200
2023-12-15 15:42:31,128 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1702665413412_0001
2023-12-15 15:42:31,897 INFO input.FileInputFormat: Total input files to process : 1
2023-12-15 15:42:31,997 INFO mapreduce.JobSubmitter: number of splits:1
2023-12-15 15:42:32,285 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1702665413412_0001
2023-12-15 15:42:32,286 INFO mapreduce.JobSubmitter: Executing with tokens: []
2023-12-15 15:42:32,404 INFO conf.Configuration: resource-types.xml not found
2023-12-15 15:42:32,405 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2023-12-15 15:42:32,625 INFO impl.YarnClientImpl: Submitted application application_17026654

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# HDFS directory holding the job result, and the local file it is merged into
result_dir=shakespeare-output
merged_file="$result_dir.txt"

# list the output files of the job
hdfs dfs -ls "$result_dir"

# concatenate them into a single local file
hdfs dfs -getmerge "$result_dir" "$merged_file"

# show the first 10 lines of the merged result
head -n 10 "$merged_file"

## Hadoop Streaming

In [None]:
%%dockerexec hadoop

# -p: create missing parents and do not fail when the directory already exists,
# so the cell can be re-run.
mkdir -p /opt/src/wc_streaming

# The quoted delimiter ('EOF') makes the shell copy the script verbatim.
cat <<'EOF' > /opt/src/wc_streaming/wordmapper.py
#!/usr/bin/env python3
# wordmapper.py
"""Hadoop Streaming mapper: print "<word><TAB>1" for every whitespace-separated word on stdin."""

import sys

for line in sys.stdin:
    # str.split() without arguments already ignores leading/trailing whitespace.
    for word in line.split():
        print('%s\t%s' % (word, 1))
EOF

# The script starts with a shebang line; set the executable bit so it can be
# run directly (e.g. to try it on a sample file outside Hadoop).
chmod +x /opt/src/wc_streaming/wordmapper.py

In [None]:
%%dockerexec hadoop

# The quoted delimiter ('EOF') makes the shell copy the script verbatim.
cat <<'EOF' > /opt/src/wc_streaming/wordreducer.py
#!/usr/bin/env python3
# wordreducer.py
"""Hadoop Streaming reducer: sum the counts of consecutive identical words.

Reads "<word><TAB><count>" lines from stdin and prints "<word><TAB><total>"
each time the word changes, so equal words must arrive next to each other.
"""

import sys

thisword = None   # word whose counts are currently being accumulated
wordcount = 0     # running total for thisword

for line in sys.stdin:
    line = line.strip()
    if not line:
        # A blank line carries no record; skipping it avoids a ValueError
        # when unpacking the split below.
        continue

    word, count = line.split('\t', 1)
    count = int(count)

    if thisword == word:
        wordcount += count
    else:
        # The word changed: flush the finished total before starting a new one.
        if thisword is not None:
            print('%s\t%d' % (thisword, wordcount))
        wordcount = count
        thisword = word

# Flush the last word. When there was no input at all, thisword is still None
# and nothing must be printed (previously this emitted a bogus "None<TAB>0" line).
if thisword is not None:
    print('%s\t%d' % (thisword, wordcount))
EOF

# The script starts with a shebang line; set the executable bit so it can be
# run directly (e.g. to try it on a sample file outside Hadoop).
chmod +x /opt/src/wc_streaming/wordreducer.py

In [None]:
%%dockerexec hadoop

# Load the environment first, like every other cell that calls hdfs/hadoop.
source /opt/envvars.sh

# download book "Ulysses, by James Joyce" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/4300/4300-0.txt -O ulysses.txt

# wget runs quietly (-q), so make a failed download visible instead of
# uploading an empty file to HDFS.
if [ ! -s ulysses.txt ]; then
  echo "download of ulysses.txt failed" >&2
  exit 1
fi

# create directory in HDFS and put file
# -p: no error when the directory already exists (missing parents are created)
# -f: overwrite a copy left by a previous run
# Together they make this cell safe to run more than once.
hdfs dfs -mkdir -p ulysses
hdfs dfs -put -f ulysses.txt ulysses
hdfs dfs -ls -h ulysses

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/src/wc_streaming || exit 1

# A MapReduce job refuses to start when its output directory already exists,
# so remove the result of any previous run first
# (-r: recursive, -f: no error when the directory is absent).
hdfs dfs -rm -r -f ulysses-output

# execute using Hadoop Streaming
# - The jar is located with a glob so the cell does not depend on one exact
#   Hadoop release number.
# - The scripts are shipped with the generic option -files (comma-separated),
#   which replaces the deprecated streaming option -file. Generic options must
#   appear before the streaming options (-input, -output, ...).
hadoop jar \
"$HADOOP_HOME"/share/hadoop/tools/lib/hadoop-streaming-*.jar \
-files wordmapper.py,wordreducer.py \
-input ulysses \
-output ulysses-output \
-mapper wordmapper.py \
-reducer wordreducer.py

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# HDFS directory holding the job result, and the local file it is merged into
result_dir=ulysses-output
merged_file="$result_dir.txt"

# list the output files of the job
hdfs dfs -ls "$result_dir"

# concatenate them into a single local file
hdfs dfs -getmerge "$result_dir" "$merged_file"

# show the first 10 lines of the merged result
head -n 10 "$merged_file"