In [1]:
%load_ext dockermagic

# Spark
![Spark](https://spark.apache.org/images/spark-logo-trademark.png)

- https://spark.apache.org/

## Setup

- version 3.5.0 (Pre-built for Apache Hadoop 3.3 and later)

In [2]:
%%dockerexec hadoop

# Install Spark under /opt and register its environment variables.
# Every step is safe to repeat, so the cell can be re-run on an existing container.

SPARK_VERSION=3.5.0
SPARK_PKG=spark-${SPARK_VERSION}-bin-hadoop3

# Download package (-c resumes a partial download instead of starting over).
# archive.apache.org keeps every release, whereas dlcdn.apache.org only
# serves the releases that are current at the time of the download.
mkdir -p /opt/pkgs
cd /opt/pkgs
wget -q -c https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PKG}.tgz

# unpack file and create link
# -sfn replaces an existing /opt/spark link instead of failing
# (or creating a nested link inside the target directory) on a re-run
tar -zxf ${SPARK_PKG}.tgz -C /opt
ln -sfn /opt/${SPARK_PKG} /opt/spark

# update envvars.sh -- only once, so re-running the cell does not append duplicates.
# The quoted 'EOF' delimiter writes ${...} literally; they are expanded when the file is sourced.
if ! grep -q 'SPARK_HOME=' /opt/envvars.sh 2> /dev/null; then
cat >> /opt/envvars.sh << 'EOF'
# Spark
export SPARK_HOME=/opt/spark
export PYSPARK_PYTHON=python3
export PYSPARK_DRIVER_PYTHON=python3
export PYTHONIOENCODING=utf8
export PATH=${PATH}:${SPARK_HOME}/bin

EOF
fi

cat /opt/envvars.sh

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
export PDSH_RCMD_TYPE=ssh
export HADOOP_HOME=/opt/hadoop
export HADOOP_VERSION=3.3.6
export HADOOP_COMMON_HOME=${HADOOP_HOME}
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export HADOOP_YARN_HOME=${HADOOP_HOME}
export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
# Hive
export HIVE_HOME=/opt/hive
export PATH=${PATH}:${HIVE_HOME}/bin

# Spark
export SPARK_HOME=/opt/spark
export PYSPARK_PYTHON=python3
export PYSPARK_DRIVER_PYTHON=python3
export PYTHONIOENCODING=utf8
export PATH=${PATH}:${SPARK_HOME}/bin



## Example with Pi

In [10]:
%%dockerexec hadoop

# Bring SPARK_HOME and the other exported variables into this shell
. /opt/envvars.sh

# Other ways to launch the same example (uncomment one to try it):
#
#   local execution
#   $SPARK_HOME/bin/run-example --master local SparkPi 10 2> /dev/null
#
#   local execution with local[4]
#   $SPARK_HOME/bin/run-example --master local[4] SparkPi 10 2> /dev/null
#
#   execution on YARN
#   $SPARK_HOME/bin/run-example --master yarn SparkPi 10

# Submit the SparkPi class from the bundled examples jar to YARN;
# the trailing 10 is the argument handed to SparkPi.
$SPARK_HOME/bin/spark-submit \
    --master yarn \
    --class org.apache.spark.examples.SparkPi \
    $SPARK_HOME/examples/jars/spark-examples_2.12-3.5.0.jar \
    10

23/12/18 22:32:42 INFO SparkContext: Running Spark version 3.5.0
23/12/18 22:32:42 INFO SparkContext: OS info Linux, 6.2.0-1018-azure, amd64
23/12/18 22:32:42 INFO SparkContext: Java version 1.8.0_392
23/12/18 22:32:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/18 22:32:42 INFO ResourceUtils: No custom resources configured for spark.driver.
23/12/18 22:32:42 INFO SparkContext: Submitted application: Spark Pi
23/12/18 22:32:42 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
23/12/18 22:32:42 INFO ResourceProfile: Limiting resource is cpus at 1 tasks per executor
23/12/18 22:32:42 INFO ResourceProfileManager: Added ResourceProfile id: 0
23/12/18 22:32:4

## Using pyspark

```bash
# Load the exported variables (SPARK_HOME, PATH, ...) into the current shell
source /opt/envvars.sh
# Open the interactive PySpark shell with YARN as the master
pyspark --master yarn
```

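- Spark application UI - http://localhost:4040
- The snippet below reads `hdfs:///user/hadoop/shakespeare`; run the download cell further down in this section first so that the directory exists in HDFS.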

```python
# `sc` is the SparkContext that the pyspark shell creates on start-up.
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")

# Split every line on single spaces, pair each token with 1, then sum per token.
words = text_file.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
counts = pairs.reduceByKey(lambda a, b: a + b)

# Write the (word, count) pairs to HDFS, then fetch them into the shell.
counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")
counts.collect()
```

```python
# Leave the pyspark shell
exit()
```

In [20]:
%%dockerexec hadoop

# Download the stop-word list and the Shakespeare corpus and upload both to HDFS.
# Relative HDFS paths below resolve to the current user's HDFS home directory.

source /opt/envvars.sh

# directory that will hold the Spark scripts written by the following cells
mkdir -p /opt/src/spark

# stop-word list (a single comma-separated line)
wget -q -c https://tinyurl.com/y68jxy7f -O stop-word-list.csv
hdfs dfs -mkdir -p stopwords
# -f overwrites a copy left by an earlier run; without it a re-run fails with "File exists"
hdfs dfs -put -f stop-word-list.csv stopwords
hdfs dfs -cat stopwords/stop-word-list.csv
# the list has no trailing newline; print one so the next command's output starts on its own line
echo

# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c https://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# create directory in HDFS and put file (-f: overwrite on re-run)
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put -f shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

a, able, about, across, after, all, almost, also, am, among, an, and, any, are, as, at, be, because, been, but, by, can, cannot, could, dear, did, do, does, either, else, ever, every, for, from, get, got, had, has, have, he, her, hers, him, his, how, however, i, if, in, into, is, it, its, just, least, let, like, likely, may, me, might, most, must, my, neither, no, nor, not, of, off, often, on, only, or, other, our, own, rather, said, say, says, she, should, since, so, some, than, that, the, their, them, then, there, these, they, this, tis, to, too, twas, us, wants, was, we, were, what, when, where, which, while, who, whom, why, will, with, would, yet, you, yourFound 1 items
-rw-r--r--   2 hadoop hadoop      5.4 M 2023-12-18 23:08 shakespeare/shakespeare.txt


In [12]:
%%dockerwrite hadoop /opt/src/spark/wordcount.py

from pyspark import SparkContext

sc = SparkContext("local", "WordCount")

text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")
counts.collect()

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 2.05kB to hadoop:/opt/src/spark/wordcount.py


In [42]:
%%dockerwrite hadoop /opt/src/spark/wordcount.py

from pyspark import SparkContext
import re
import string

# Initialize SparkContext
sc = SparkContext("local", "WordCount")

# Function to clean text: remove punctuation and control characters
def clean_text(text):
    # Remove punctuation
    text = re.sub(f'[{string.punctuation}]', '', text)
    # Remove control characters
    text = re.sub(r'[\r\n\t]', ' ', text)
    return text.lower()

# Read stopwords from HDFS
stopwords_path = "hdfs:///user/hadoop/stopwords/stop-word-list.csv"
stopwords = sc.textFile(stopwords_path) \
              .flatMap(lambda line: line.split(",")) \
              .map(lambda word: word.strip()) \
              .collect()
stopwords_broadcast = sc.broadcast(set(stopwords))

# Read the Shakespeare text file
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")

# Word count excluding stopwords
counts = text_file.flatMap(lambda line: clean_text(line).split(" ")) \
                  .filter(lambda word: word and word not in stopwords_broadcast.value) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b) \
                  .sortBy(lambda word_count: word_count[1], ascending=False) \
                  .take(30) 

# Save the results to HDFS
# counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")

# Print the top 30 words and their counts
for word, count in counts:
    print(f"{word}: {count}")

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[?25h[u[2KSuccessfully copied 3.07kB to hadoop:/opt/src/spark/wordcount.py


In [43]:
%%dockerexec hadoop

source /opt/envvars.sh

# Remove the output of a previous run; -f suppresses the error when the directory does not exist
hdfs dfs -rm -r -f /user/hadoop/shakespeare_result

cd /opt/src/spark

# Run wordcount.py locally; stderr is discarded so only the script's own output is shown
spark-submit --master local wordcount.py 2> /dev/null

# Interactive alternatives:
# pyspark --name WordCount --master local
# pyspark --name WordCount --master yarn

rm: `/user/hadoop/shakespeare_result': No such file or directory
thou: 5836
thy: 4349
shall: 3837
thee: 3403
lord: 3089
king: 3004
now: 2996
sir: 2952
good: 2941
come: 2613
more: 2505
enter: 2408
o: 2319
well: 2271
love: 2271
here: 2253
hath: 2056
one: 1948
man: 1901
i’ll: 1885
upon: 1864
make: 1787
go: 1785
know: 1752
scene: 1631
see: 1548
such: 1531
’tis: 1498
out: 1459
give: 1417


In [35]:
%%dockerexec hadoop

. /opt/envvars.sh

# List the job's output directory, then print the beginning of its first part file
RESULT_DIR=/user/hadoop/shakespeare_result
hdfs dfs -ls "${RESULT_DIR}"
hdfs dfs -head "${RESULT_DIR}/part-00000"

Found 2 items
-rw-r--r--   2 hadoop hadoop          0 2023-12-18 23:19 /user/hadoop/shakespeare_result/_SUCCESS
-rw-r--r--   2 hadoop hadoop     543162 2023-12-18 23:19 /user/hadoop/shakespeare_result/part-00000
('project', 95)
('gutenberg', 26)
('ebook', 13)
('complete', 24)
('works', 65)
('william', 90)
('shakespeare', 10)
('use', 360)
('anyone', 15)
('anywhere', 8)
('united', 20)
('states', 34)
('parts', 121)
('world', 671)
('cost', 54)
('restrictions', 2)
('whatsoever', 17)
('copy', 24)
('give', 1417)
('away', 900)
('reuse', 2)
('under', 316)
('terms', 102)
('license', 24)
('included', 4)
('online', 4)
('wwwgutenbergorg', 5)
('located', 7)
('check', 31)
('laws', 51)
('country', 166)
('before', 981)
('using', 14)
('title', 107)
('author', 15)
('release', 8)
('date', 26)
('january', 3)
('1994', 1)
('100', 3)
('recently', 1)
('updated', 2)
('december', 6)
('16', 3)
('2023', 1)
('language', 44)
('english', 166)
('start', 39)
('contents', 66)
('sonnets', 6)
('all’s', 44)
('well', 2271)


## pyspark-pictures

- https://github.com/jkthompson/pyspark-pictures/