In [None]:
# Load the dockermagic IPython extension; presumably it supplies the
# %%dockerexec / %%dockerwrite cell magics used in the cells below — TODO confirm.
%load_ext dockermagic

# Spark
![Spark](https://spark.apache.org/images/spark-logo-trademark.png)

- https://spark.apache.org/

## Setup

- version 3.5.0 (Pre-built for Apache Hadoop 3.3 and later)

In [None]:
%%dockerexec hadoop

# Install Spark under /opt and register its environment variables.
# Every step is written so the cell can be re-run safely.
SPARK_VERSION=3.5.0
SPARK_PKG="spark-${SPARK_VERSION}-bin-hadoop3"

# Download package.
# The archive mirror is used because dlcdn.apache.org only carries the
# current releases; older versions such as 3.5.0 are served from the archive.
# wget -c resumes (or skips) a download that already exists locally.
mkdir -p /opt/pkgs
cd /opt/pkgs
wget -q -c "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_PKG}.tgz"

# Unpack file and create link.
# -f replaces an existing link; -n stops ln from following an existing
# /opt/spark link and creating a nested link inside the Spark directory.
tar -zxf "${SPARK_PKG}.tgz" -C /opt
ln -sfn "/opt/${SPARK_PKG}" /opt/spark

# Update envvars.sh only once: skip the append when a Spark section is
# already present, so re-running the cell does not duplicate the entries.
# The \${...} escapes keep the variables unexpanded in the written file.
if ! grep -q 'SPARK_HOME=' /opt/envvars.sh 2> /dev/null; then
cat >> /opt/envvars.sh << EOF
# Spark
export SPARK_HOME=/opt/spark
export PYSPARK_PYTHON=python3
export PYSPARK_DRIVER_PYTHON=python3
export PYTHONIOENCODING=utf8
export PATH=\${PATH}:\${SPARK_HOME}/bin

EOF
fi

cat /opt/envvars.sh

## Example with Pi

In [None]:
%%dockerexec hadoop

# Pull in SPARK_HOME and PATH from the shared environment file
source /opt/envvars.sh

# Other ways to launch the same example (uncomment one to try it):
#
#   local mode:
#     $SPARK_HOME/bin/run-example --master local SparkPi 10 2> /dev/null
#
#   local mode, local[4]:
#     $SPARK_HOME/bin/run-example --master local[4] SparkPi 10 2> /dev/null
#
#   on YARN:
#     $SPARK_HOME/bin/run-example --master yarn SparkPi 10

# Submit SparkPi to YARN through spark-submit; the trailing 10 is handed
# to the application as its argument.
EXAMPLES_JAR=$SPARK_HOME/examples/jars/spark-examples_2.12-3.5.0.jar
$SPARK_HOME/bin/spark-submit \
  --master yarn \
  --class org.apache.spark.examples.SparkPi \
  $EXAMPLES_JAR 10

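## Using pyspark

Start an interactive PySpark shell inside the `hadoop` container. The word-count snippet below reads `hdfs:///user/hadoop/shakespeare`, which is uploaded by the data-download cell further down in this section, so run that cell first.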

```bash
source /opt/envvars.sh
pyspark --master yarn
```

- Spark application UI - http://localhost:4040

```python
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)
counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")
counts.collect()
```

```python
exit()
```

In [None]:
%%dockerexec hadoop

# Fetch the input data sets and upload them to HDFS.
# Relative HDFS paths resolve against the current user's HDFS home directory.
source /opt/envvars.sh

# Directory that receives the PySpark scripts written by later cells
mkdir -p /opt/src/spark

# Stop-word list (CSV) used to filter common words out of the word count
wget -q -c https://tinyurl.com/y68jxy7f -O stop-word-list.csv
hdfs dfs -mkdir -p stopwords
# -f overwrites an existing copy, so re-running the cell does not fail
hdfs dfs -put -f stop-word-list.csv stopwords
hdfs dfs -cat stopwords/stop-word-list.csv

# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c https://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# create directory in HDFS and put file (-f: overwrite on re-run)
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put -f shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

In [None]:
%%dockerwrite hadoop /opt/src/spark/wordcount.py

from pyspark import SparkContext

# Run Spark in local mode under the application name "WordCount".
sc = SparkContext("local", "WordCount")

# Classic word count: split every line on single spaces, pair each token
# with 1 and sum the ones per token.
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")
counts = (
    text_file.flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Persist the full result to HDFS. The output directory must not exist yet.
counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")

# A bare counts.collect() only displays something in the interactive shell;
# in a script it would pull the entire result to the driver and discard it.
# Print a small sample instead so the run produces visible output.
for word, count in counts.take(10):
    print(f"{word}: {count}")

sc.stop()

In [None]:
%%dockerwrite hadoop /opt/src/spark/wordcount.py

from pyspark import SparkContext
import re
import string

# Create the SparkContext: local master, application name "WordCount"
sc = SparkContext(master="local", appName="WordCount")

def clean_text(text):
    """Normalize a line of text for word counting.

    Removes every ASCII punctuation character listed in
    ``string.punctuation``, replaces carriage returns, newlines and tabs
    with a single space each, and lower-cases the result.

    Args:
        text: The input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # string.punctuation contains regex metacharacters (\ ] ^ -). Without
    # escaping, "\]" inside the character class is read as an escaped "]",
    # so the backslash itself would never be removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Turn control characters into spaces so they act as word separators
    text = re.sub(r'[\r\n\t]', ' ', text)
    return text.lower()

# Read stopwords from HDFS: the file is comma-separated, so split each line
# on "," and strip surrounding whitespace from every entry.
stopwords_path = "hdfs:///user/hadoop/stopwords/stop-word-list.csv"
stopwords = (
    sc.textFile(stopwords_path)
    .flatMap(lambda line: line.split(","))
    .map(lambda word: word.strip())
    .collect()
)
# Broadcast the set once so each executor receives a single read-only copy
stopwords_broadcast = sc.broadcast(set(stopwords))

# Read the Shakespeare text file
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")

# Word count excluding stopwords, ordered by descending frequency.
# Kept as an RDD (not yet truncated) so it can be both saved and sampled;
# cached because two actions (save + take) consume it.
word_counts = (
    text_file.flatMap(lambda line: clean_text(line).split(" "))
    # Drop the empty tokens produced by consecutive spaces, and stopwords
    .filter(lambda word: word and word not in stopwords_broadcast.value)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda word_count: word_count[1], ascending=False)
    .cache()
)

# Save the results to HDFS. This has to happen on the RDD: after take()
# the result is a plain Python list, which has no saveAsTextFile method.
# The output directory must not exist yet.
word_counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")

# Print the top 30 words and their counts
counts = word_counts.take(30)
for word, count in counts:
    print(f"{word}: {count}")

sc.stop()

In [None]:
%%dockerexec hadoop

# Run the word-count script with spark-submit in local mode.
source /opt/envvars.sh

# Remove output left over from a previous run.
# -f suppresses the error (and non-zero status) when the directory does not
# exist, which is always the case on the first run.
hdfs dfs -rm -r -f /user/hadoop/shakespeare_result

cd /opt/src/spark

# stderr is discarded to hide Spark's log output; drop the redirection
# when the job fails and the error message is needed.
spark-submit --master local wordcount.py 2> /dev/null

# To explore interactively instead, start a shell with one of:
# pyspark --name WordCount --master local
# pyspark --name WordCount --master yarn

In [None]:
%%dockerexec hadoop

# Inspect the word-count output stored in HDFS.
source /opt/envvars.sh

RESULT_DIR=/user/hadoop/shakespeare_result

# The directory only exists when the submitted job called saveAsTextFile;
# test for it first so the cell reports that clearly instead of failing
# with "No such file or directory".
if hdfs dfs -test -d "$RESULT_DIR"; then
    hdfs dfs -ls "$RESULT_DIR"
    # Show the beginning of the first output partition
    hdfs dfs -head "$RESULT_DIR/part-00000"
else
    echo "$RESULT_DIR not found: the job has to call saveAsTextFile to create it"
fi

## pyspark-pictures

- https://github.com/jkthompson/pyspark-pictures/