In [None]:
%load_ext dockermagic

# Pig
![Pig](https://pig.apache.org/images/pig-logo.gif)

- https://pig.apache.org

## Setup

- Version 0.17.0

In [None]:
%%dockerexec hadoop

# Install Pig 0.17.0 under /opt and register it in /opt/envvars.sh.
# Every step is written so that the cell can be re-run safely.

# Download package (-c resumes a partial download instead of starting over)
mkdir -p /opt/pkgs
cd /opt/pkgs
wget -q -c https://downloads.apache.org/pig/pig-0.17.0/pig-0.17.0.tar.gz

# unpack file and create link
tar -zxf pig-0.17.0.tar.gz -C /opt
# -f replaces an existing link; -n stops ln from following /opt/pig and
# creating a nested /opt/pig/pig-0.17.0 link when the cell is re-run
ln -sfn /opt/pig-0.17.0 /opt/pig

# update envvars.sh only once: skip the append when PIG_HOME is already set there
# (grep -s stays silent if the file does not exist yet)
if ! grep -qs PIG_HOME= /opt/envvars.sh; then
cat >> /opt/envvars.sh << EOF
# Pig
export PIG_HOME=/opt/pig
export PATH=\${PATH}:\${PIG_HOME}/bin

EOF
fi

# show the resulting environment file
cat /opt/envvars.sh

## Example

In [None]:
%%dockerexec hadoop

# Load the environment file (the setup cell appended the Pig settings to it)
. /opt/envvars.sh

# Working directory for the example datasets
mkdir -p /opt/datasets_pig
cd /opt/datasets_pig

# Fetch the stations dataset quietly, resuming a partial download if present
wget -qc -O stations.csv https://tinyurl.com/y5roz8kz

## Local execution

In [None]:
%%dockerwrite hadoop /opt/src/list_stations_local.pig

-- Read the comma-separated stations file from the current directory,
-- declaring one typed field per column.
stations = LOAD 'stations.csv' USING PigStorage(',') AS (
    station_id:int,
    name:chararray,
    lat:float,
    long:float,
    dockcount:int,
    landmark:chararray,
    installation:chararray
);

-- Project only the identifier and the name of each station.
station_ids_names = FOREACH stations
                    GENERATE station_id, name;

-- Sort the projection by station name.
ordered = ORDER station_ids_names BY name;

-- Print the schema of the loaded relation.
DESCRIBE stations;

-- Walk a small sample through each step of the pipeline.
ILLUSTRATE ordered;

-- Print the sorted result.
DUMP ordered;

In [None]:
%%dockerexec hadoop

# Load the environment file so that the pig launcher is on the PATH
. /opt/envvars.sh

# The script loads 'stations.csv' by relative path, so run it from the dataset directory
cd /opt/datasets_pig

# Execute in local mode; stderr is discarded
pig -x local -f /opt/src/list_stations_local.pig 2> /dev/null

## Cluster execution

In [None]:
%%dockerwrite hadoop /opt/src/list_stations_cluster.pig

-- Load the comma-separated station files from the 'stations' directory.
stations = LOAD 'stations' USING PigStorage(',') AS 
(station_id:int, name:chararray, lat:float, long:float, dockcount:int, landmark:chararray, installation:chararray);

-- Keep only the identifier and the name of each station.
station_ids_names = FOREACH stations GENERATE station_id, name;

-- Sort by station name.
ordered = ORDER station_ids_names BY name;

-- STORE fails when the output directory already exists, so remove the
-- result of a previous run first (rmf does not complain if it is missing).
rmf ordered

-- Write the sorted result to the 'ordered' directory.
STORE ordered INTO 'ordered';

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/datasets_pig

# Upload to HDFS
# -p: create missing parent directories and do not fail if 'stations' already exists
# -f: overwrite the file if it was already uploaded, so the cell can be re-run
hdfs dfs -mkdir -p stations
hdfs dfs -put -f stations.csv stations

# run in Hadoop cluster using mapreduce (stderr is discarded)
pig -x mapreduce -f /opt/src/list_stations_cluster.pig 2>/dev/null

# print the result files written by the script
hdfs dfs -cat ordered/*

## WordCount using Pig

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

cd /opt/datasets_pig

# download the stop word list and upload it to HDFS
# (-put -f overwrites an earlier upload, so the cell can be re-run)
wget -q -c https://tinyurl.com/y68jxy7f -O stop-word-list.csv
hdfs dfs -mkdir -p stopwords
hdfs dfs -put -f stop-word-list.csv stopwords
hdfs dfs -cat stopwords/stop-word-list.csv

# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Project Gutenberg
wget -q -c http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# create directory in HDFS and put file (overwriting an earlier upload)
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put -f shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

In [None]:
%%dockerwrite hadoop /opt/src/wordcount.pig

-- Word count over the 'shakespeare' dataset, ignoring the words listed in
-- 'stopwords'; stores and prints the 30 most frequent remaining words.

-- List HDFS content
fs -ls
fs -ls shakespeare

-- Job name to appear in YARN
SET job.name 'Word Count in Pig';

-- Load shakespeare dataset, one line of text per record
shakespeare = LOAD 'shakespeare' AS (lineoftext:chararray);

-- Load stopwords, one record per line
stopwords = LOAD 'stopwords' USING PigStorage() AS (stopword:chararray);

-- Create bag of words: trim and lower-case each line, strip punctuation and
-- control characters, then split into one record per token
words = FOREACH shakespeare GENERATE
        FLATTEN(TOKENIZE(REPLACE(LOWER(TRIM(lineoftext)),
        '[\\p{Punct},\\p{Cntrl}]',''))) AS word;

-- Remove empty words
realwords = FILTER words BY SIZE(word) > 0;

-- Create bag of stop words, one record per stop word
flattened_stopwords = FOREACH stopwords GENERATE
       FLATTEN(TOKENIZE(stopword)) AS stopword;

-- Associate words with respective stop words; the right outer join keeps
-- every word and leaves the stop word side NULL when there is no match
right_joined = JOIN flattened_stopwords
               BY stopword RIGHT OUTER,
               realwords BY word;

-- Remove stop words: keep only the rows without a matching stop word
meaningful_words = FILTER right_joined BY
          (flattened_stopwords::stopword IS NULL);

-- Retrieve remaining words
shakespeare_real_words = FOREACH meaningful_words
          GENERATE realwords::word AS word;

-- Group words
grouped = GROUP shakespeare_real_words BY word;

-- Count grouped words
counted = FOREACH grouped GENERATE group AS word,
          COUNT(shakespeare_real_words) AS wordcount;

-- Sort by count in descending order
ordered = ORDER counted BY wordcount DESC;

-- Select 30 first words
top30 = LIMIT ordered 30;

-- STORE fails when the output directory already exists, so remove the
-- result of a previous run first (rmf does not complain if it is missing)
rmf shakespeare_top30

-- Store output
STORE top30 INTO 'shakespeare_top30';

-- Show output from HDFS
fs -cat shakespeare_top30/*


In [None]:
%%dockerexec hadoop

# Load the environment file so that the pig launcher is on the PATH
. /opt/envvars.sh

# Submit the word count script as a MapReduce job;
# stderr is written to wordcount.log in the current directory
pig -x mapreduce -f /opt/src/wordcount.pig 2> wordcount.log

# To delete the stored result from HDFS, uncomment:
# hdfs dfs -rm -r shakespeare_top30