In [None]:
# Load environment variables
# The dotenv extension supplies the %dotenv magic used on the next line.
%load_ext dotenv
# -o presumably lets values from the file override variables that are already
# set in the kernel environment (confirm against the python-dotenv docs).
%dotenv -o /opt/envvars.sh
# A bare %env lists the current environment variables.
# NOTE(review): the listing may contain credentials; clear this output before
# sharing or committing the notebook.
%env

# MapReduce

## Java MapReduce API - WordCount

In [None]:
# Create the working directory for the Java sources (-p: no error if it already
# exists) and switch to it; the %%writefile cells below use relative file names,
# so they write into this directory.
%mkdir -p /opt/src/wc_java
%cd /opt/src/wc_java

In [None]:
%%writefile WordCountDriver.java
// WordCountDriver.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountDriver extends Configured implements Tool {
  public int run(String[] args) throws Exception {
   if (args.length != 2) {
    System.out.printf("Usage: %s [generic options] <inputdir> <outputdir>\n", getClass().getSimpleName());
    return -1;
   }

   Job job = Job.getInstance(getConf(), "Word Count");
   job.setJarByClass(WordCountDriver.class);
   FileInputFormat.setInputPaths(job, new Path(args[0]));
   FileOutputFormat.setOutputPath(job, new Path(args[1]));
   job.setMapperClass(WordCountMapper.class);
   job.setReducerClass(WordCountReducer.class);
   job.setMapOutputKeyClass(Text.class);
   job.setMapOutputValueClass(IntWritable.class);
   job.setOutputKeyClass(Text.class);
   job.setOutputValueClass(IntWritable.class);
   boolean success = job.waitForCompletion(true);
   return success ? 0 : 1;
 }

 public static void main(String[] args) throws Exception {
   int exitCode = ToolRunner.run(new Configuration(), new WordCountDriver(), args);
   System.exit(exitCode);
 }
}

In [None]:
%%writefile WordCountMapper.java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map stage of WordCount: writes one {@code (word, 1)} pair for every word found
 * in an input line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

  /** The constant count written with every word. */
  private static final IntWritable ONE = new IntWritable(1);

  /** Output key object, reused for every record that is written. */
  private final Text currentWord = new Text();

  /**
   * Splits the line on runs of non-word characters and writes each non-empty
   * piece with a count of one.
   *
   * @param key     input key of the record; not used by this mapper
   * @param value   one line of input text
   * @param context receives the {@code (word, 1)} pairs
   */
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {

    String[] tokens = value.toString().split("\\W+");
    for (String token : tokens) {
      if (token.isEmpty()) {
        // A line that starts with a delimiter yields an empty first element.
        continue;
      }
      currentWord.set(token);
      context.write(currentWord, ONE);
    }
  }
}

In [None]:
%%writefile WordCountReducer.java
// WordCountReducer.java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce stage of WordCount: adds up all counts received for a word and writes
 * a single {@code (word, total)} pair.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

  /** Output value object, reused for every key that is written. */
  private final IntWritable total = new IntWritable();

  /**
   * Sums the counts of one word.
   *
   * @param key     the word
   * @param values  the counts collected for that word
   * @param context receives the {@code (word, total)} pair
   */
  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable partialCount : values) {
      sum = sum + partialCount.get();
    }

    total.set(sum);
    context.write(key, total);
  }
}

In [None]:
%%bash

# print the classpath reported by Hadoop; the compile cell below passes the
# output of this same command to javac
hadoop classpath

In [None]:
%%bash

# Stop at the first failing command: without this, a compilation error would be
# hidden because jar and ls would still run and the cell would appear to succeed.
set -euo pipefail

# compile source code against the Hadoop libraries
# (the substitution is quoted so the shell leaves the classpath wildcards alone;
# javac expands them itself)
javac -classpath "$(hadoop classpath)" *.java

# create jar file from the compiled classes
jar cvf wc.jar *.class
ls

In [None]:
%%bash

# Stop at the first failing command so that a failed cd or download does not
# lead to a wrong or empty file being uploaded.
set -euo pipefail

cd /opt/datasets
# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# create directory in HDFS and put file
# -p (mkdir) and -f (put) keep this cell re-runnable: the directory may already
# exist and a previously uploaded copy of the file is overwritten
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put -f shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

In [None]:
%%bash

# remove the output of a previous run: the job refuses to start when the output
# directory already exists (-f: no error if there is nothing to remove)
hdfs dfs -rm -r -f shakespeare-output

# run wordcount using 2 reducers
hadoop jar wc.jar WordCountDriver -D mapreduce.job.reduces=2 shakespeare shakespeare-output

In [None]:
%%bash

# name shared by the HDFS output directory and the merged local file
result=shakespeare-output

cd /opt/datasets

# list the files the job wrote to HDFS
hdfs dfs -ls "${result}"

# merge them into one local file
hdfs dfs -getmerge "${result}" "${result}.txt"

# show the first ten lines
head -n 10 "${result}.txt"

## Hadoop Streaming

In [None]:
# Create the working directory for the streaming scripts and switch to it.
# -p keeps the cell re-runnable: without it mkdir fails when the directory
# already exists.
%mkdir -p /opt/src/wc_streaming
%cd /opt/src/wc_streaming

In [None]:
%%writefile wordmapper.py
#!/usr/bin/env python3
# wordmapper.py
"""Mapper for the streaming word count.

Reads text from standard input and prints one "<word><TAB>1" line for every
whitespace-separated token.
"""

import sys

for text_line in sys.stdin:
    # split() without arguments already ignores leading and trailing whitespace
    for token in text_line.split():
        print('%s\t%s' % (token, 1))

In [None]:
%%writefile wordreducer.py
#!/usr/bin/env python3
# wordreducer.py
"""Reducer for the streaming word count.

Reads "<word><TAB><count>" lines from standard input and prints one
"<word><TAB><total>" line per run of consecutive identical words, so equal
words must arrive next to each other.
"""

import sys


def reduce_counts(lines):
    """Sum the counts of consecutive lines that carry the same word.

    Args:
        lines: iterable of "<word><TAB><count>" strings.

    Yields:
        (word, total) tuples, one per run of identical words, in input order.

    Raises:
        ValueError: if a non-blank line has no tab or its count is not an integer.
    """
    current_word = None
    running_total = 0

    for line in lines:
        line = line.strip()
        if not line:
            # a blank line carries no record; skip it instead of failing on unpack
            continue
        word, count = line.split('\t', 1)
        count = int(count)

        if word == current_word:
            running_total += count
            continue

        if current_word is not None:
            yield current_word, running_total
        current_word = word
        running_total = count

    # Flush the last run. The explicit None check matters: with empty input
    # nothing must be emitted (comparing two unset names would print "None 0").
    if current_word is not None:
        yield current_word, running_total


def main():
    """Reduce standard input and print the totals to standard output."""
    for word, total in reduce_counts(sys.stdin):
        print('%s\t%d' % (word, total))


if __name__ == '__main__':
    main()

In [None]:
%%bash

# Stop at the first failing command so that a failed cd or download does not
# lead to a wrong or empty file being uploaded.
set -euo pipefail

cd /opt/datasets

# download book "Ulysses, by James Joyce" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/4300/4300-0.txt -O ulysses.txt

# create directory in HDFS and put file
# -p (mkdir) and -f (put) keep this cell re-runnable: the directory may already
# exist and a previously uploaded copy of the file is overwritten
hdfs dfs -mkdir -p ulysses
hdfs dfs -put -f ulysses.txt ulysses
hdfs dfs -ls -h ulysses

In [None]:
%%bash

# execute using Hadoop Streaming
# remove the output of a previous run: the job refuses to start when the output
# directory already exists (-f: no error if there is nothing to remove)
hdfs dfs -rm -r -f ulysses-output

# execute using Hadoop Streaming
# The scripts are shipped with the generic -files option, which replaces the
# deprecated -file option; generic options must come before the streaming options.
hadoop jar \
"$HADOOP_HOME"/share/hadoop/tools/lib/hadoop-streaming-3.2.3.jar \
-files wordmapper.py,wordreducer.py \
-input ulysses \
-output ulysses-output \
-mapper wordmapper.py \
-reducer wordreducer.py

In [None]:
%%bash

# name shared by the HDFS output directory and the merged local file
result=ulysses-output

cd /opt/datasets

# list the files the job wrote to HDFS
hdfs dfs -ls "${result}"

# merge them into one local file
hdfs dfs -getmerge "${result}" "${result}.txt"

# show the first ten lines
head -n 10 "${result}.txt"