Rosetta Code

A collection of MapReduce tasks translated (from Pig, Hive, MapReduce streaming, etc.) into Scalding. For fully runnable code, see the repository here.

Word Count

Hadoop Streaming (Ruby)

# Emit (word, count) pairs.
def mapper
  STDIN.each_line do |line|
    line.split.each do |word|
      puts [word, 1].join("\t")
    end
  end
end

# Aggregate all (word, count) pairs for a particular word.
#
# In Hadoop Streaming (unlike standard Hadoop), the reducer receives
# rows from the mapper *one at a time*, though the rows are guaranteed
# to be sorted by key (and every row associated to a particular key
# will be sent to the same reducer).
def reducer
  curr_word = nil
  curr_count = 0
  STDIN.each_line do |line|
    word, count = line.strip.split("\t")
    if word != curr_word
      puts [curr_word, curr_count].join("\t")
      curr_word = word
      curr_count = 0
    end
    curr_count += count.to_i
  end
  
  puts [curr_word, curr_count].join("\t") unless curr_word.nil?
end

Cascalog

; Takes a single piece of text as input.
; Outputs a tuple for each word in the text.
(defmapcatop tokenize [text]
  (seq (.split text "\\s+")))

(defn word-count [input-filename]
	(let [input (hfs-textline input-filename)]
    (<- [?word ?count]
        (input ?textline)
        (tokenize ?textline :> ?word)
        (c/count ?count))))

(?- (stdout) (word-count "tweets.tsv"))

Hive

# tokenizer.py
import sys

for line in sys.stdin:
  for word in line.split():
    print word

CREATE TABLE tweets (text STRING);
LOAD DATA LOCAL INPATH 'tweets.tsv' OVERWRITE INTO TABLE tweets;

SELECT word, COUNT(*) AS count
FROM (
  SELECT TRANSFORM(text) USING 'python tokenizer.py' AS word
  FROM tweets
) t
GROUP BY word;

Pig

tweets = LOAD 'tweets.tsv' AS (text:chararray);
words = FOREACH tweets GENERATE FLATTEN(TOKENIZE(text)) AS word;
word_groups = GROUP words BY word;
word_counts = FOREACH word_groups GENERATE group AS word, COUNT(words) AS count;

STORE word_counts INTO 'word_counts.tsv';

Scalding

Tsv("tweets.tsv", 'text)
  .flatMap('text -> 'word) { text : String => text.split("\\s+") }
  .groupBy('word) { _.size }
  .write(Tsv("word_counts.tsv"))

Distributed Grep

Pig

%declare PATTERN '.*hello.*';

tweets = LOAD 'tweets.tsv' AS (text:chararray);
results = FILTER tweets BY (text MATCHES '$PATTERN');

Scalding

val Pattern = ".*hello.*";

Tsv("tweets.tsv", 'text)
  .filter('text) { text : String => text.matches(Pattern) }

Inverted Index

Pig

tweets = LOAD 'tweets.tsv' AS (tweet_id:int, text:chararray);

words = FOREACH tweets GENERATE tweet_id, FLATTEN(TOKENIZE(text)) AS word;
word_groups = GROUP words BY word;
inverted_index = FOREACH word_groups GENERATE group AS word, words.tweet_id;

Scalding

val tweets = Tsv("tweets.tsv", ('id, 'text))

val wordToTweets =
  tweets
    .flatMap(('id, 'text) -> ('word, 'tweetId)) { 
      fields : (Long, String) => 
      val (tweetId, text) = fields
      text.split("\\s+").map { word => (word, tweetId) }
    }

val invertedIndex =
  wordToTweets.groupBy('word) {  _.toList[Long]('tweetId -> 'tweetIds) }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Rosetta Code

Word Count

Hadoop Streaming (Ruby)

Cascalog

Hive

Pig

Scalding

Distributed Grep

Pig

Scalding

Inverted Index

Pig

Scalding

Contents

Getting help

Documentation

Matrix API

Third Party Modules

Videos

How-tos

Tutorials

Articles

Other

Clone this wiki locally