In [None]:
%load_ext dockermagic

# Spark
![Spark](https://spark.apache.org/images/spark-logo-trademark.png)

- https://spark.apache.org/

## Setup

- version 3.5.0 (Pre-built for Apache Hadoop 3.3 and later)

In [None]:
%%dockerexec hadoop

# Download package
# NOTE: dlcdn.apache.org only mirrors the current releases; a pinned older
# version such as 3.5.0 has to come from archive.apache.org. Because wget runs
# with -q, a 404 would otherwise go unnoticed until tar fails.
mkdir -p /opt/pkgs
cd /opt/pkgs
wget -q -c https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

# unpack file and create link
# -f/-n replace an existing /opt/spark link; a plain `ln -s` on a second run
# would create a nested link inside the already-linked directory.
tar -zxf spark-3.5.0-bin-hadoop3.tgz -C /opt
ln -sfn /opt/spark-3.5.0-bin-hadoop3 /opt/spark

# update envvars.sh
# Append the Spark section only once so re-running this cell does not pile up
# duplicate exports (and duplicate PATH entries).
if ! grep -q '^export SPARK_HOME=' /opt/envvars.sh 2> /dev/null; then
cat >> /opt/envvars.sh << EOF
# Spark
export SPARK_HOME=/opt/spark
export PYSPARK_PYTHON=python3
export PYSPARK_DRIVER_PYTHON=python3
export PYTHONIOENCODING=utf8
export PATH=\${PATH}:\${SPARK_HOME}/bin

EOF
fi

# Show the resulting file for a quick visual check
cat /opt/envvars.sh

## Example with Pi

In [None]:
%%dockerexec hadoop

# Bring SPARK_HOME (and the extended PATH) into this shell
. /opt/envvars.sh

# Local execution: SparkPi with 10 partitions; Spark's stderr logging is discarded
"${SPARK_HOME}/bin/run-example" --master local SparkPi 10 2>/dev/null

# Alternatives (uncomment one to try it)
#
# Local execution with 4 processes (quoted so the shell never globs the brackets):
# "${SPARK_HOME}/bin/run-example" --master 'local[4]' SparkPi 10 2>/dev/null
#
# Execution using YARN:
# "${SPARK_HOME}/bin/run-example" --master yarn SparkPi 10 2>/dev/null
#
# Execution using spark-submit:
# "${SPARK_HOME}/bin/spark-submit" --class org.apache.spark.examples.SparkPi --master yarn \
#  "${SPARK_HOME}/examples/jars/spark-examples_2.12-3.5.0.jar" 10

## Using pyspark (interactive)

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Work inside the shared source directory; without the cd the book would be
# downloaded into whatever directory the cell happens to start in.
mkdir -p /opt/src/spark
cd /opt/src/spark

# download book "The Complete Works of William Shakespeare, by William Shakespeare" from Gutenberg Project
wget -q -c http://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# create directory in HDFS and put file
# (relative HDFS paths resolve under the current user's HDFS home directory;
#  -f overwrites an existing copy so the cell can be re-run without an error)
hdfs dfs -mkdir -p shakespeare
hdfs dfs -put -f shakespeare.txt shakespeare
hdfs dfs -ls -h shakespeare

1. Open a shell on the master node (the `hadoop` container) from a terminal

```bash
docker exec -it hadoop /bin/bash
```

2. Execute pyspark

```bash
source /opt/envvars.sh
pyspark --master local
```
3. Access Spark application UI

- http://localhost:4040

4. Write Python code in pyspark (the shell already provides the SparkContext as `sc`)

```python
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .filter(lambda word: len(word) > 0) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b) \
             .sortBy(lambda word_count: word_count[1], ascending=False)
counts.take(30)
# counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")
```

5. Exit from pyspark

```python
exit()
```

## Using spark-submit (batch job)

In [None]:
%%dockerwrite hadoop /opt/src/spark/wordcount.py

from pyspark.sql import SparkSession
from operator import add

# Create (or reuse) the Spark session for the "WordCount" application
spark = SparkSession.builder.appName("WordCount").getOrCreate()

# Read the input from HDFS as an RDD of lines
lines = spark.sparkContext.textFile("hdfs:///user/hadoop/shakespeare")

# Tokenize on single spaces and drop the empty tokens that repeated spaces produce
words = lines.flatMap(lambda line: line.split(" ")).filter(lambda word: len(word) > 0)

# Classic word count: (word, 1) pairs summed per word, most frequent first
pairs = words.map(lambda word: (word, 1))
totals = pairs.reduceByKey(add)
counts = totals.sortBy(lambda word_count: word_count[1], ascending=False)

# Optional: persist the full result (this has to run before spark.stop())
# counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")

# Print the 30 most frequent words
for word, count in counts.take(30):
    print(f"{word}: {count}")

spark.stop()

In [None]:
%%dockerexec hadoop

# Web UI to follow the submitted job: http://localhost:8088
# (presumably the YARN ResourceManager UI - confirm the container's port mapping)

# Load SPARK_HOME and put spark-submit on the PATH
. /opt/envvars.sh

# Submit the word count to YARN from the directory that holds the script;
# append `2> /dev/null` to hide Spark's log output
cd /opt/src/spark
spark-submit --master yarn wordcount.py

In [None]:
%%dockerexec hadoop

source /opt/envvars.sh

# Work inside the shared source directory; without the cd the list would be
# downloaded into whatever directory the cell happens to start in.
mkdir -p /opt/src/spark
cd /opt/src/spark

# Download the stop-word list (shortened URL; saved as stop-word-list.csv)
wget -q -c https://tinyurl.com/y68jxy7f -O stop-word-list.csv

# Upload to HDFS (-f overwrites an existing copy so the cell can be re-run)
# and print the file to verify its content
hdfs dfs -mkdir -p stopwords
hdfs dfs -put -f stop-word-list.csv stopwords
hdfs dfs -cat stopwords/stop-word-list.csv

In [None]:
%%dockerwrite hadoop /opt/src/spark/wordcount.py

from pyspark.sql import SparkSession
from operator import add
import re
import string

# Create (or reuse) the session for the "WordCount" application
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)

# RDD entry point used by the rest of the script
sc = spark.sparkContext

def clean_text(text):
    """Normalize a line of text for word counting.

    Removes every character listed in ``string.punctuation``, replaces
    carriage returns, newlines and tabs with a single space each, and
    lower-cases the result.

    Args:
        text: The input string (one line of the corpus).

    Returns:
        The cleaned, lower-cased string.
    """
    # Remove punctuation. The characters must be escaped before being placed in
    # a character class: unescaped, the "\]" inside string.punctuation is read
    # as an escaped bracket, so the backslash itself would never be stripped.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Turn control characters into spaces so they act as word separators
    text = re.sub(r'[\r\n\t]', ' ', text)
    return text.lower()

# Load the comma-separated stop-word list from HDFS and collect it on the driver
stopwords_path = "hdfs:///user/hadoop/stopwords/stop-word-list.csv"
stopword_entries = sc.textFile(stopwords_path).flatMap(lambda line: line.split(","))
stopwords = stopword_entries.map(lambda word: word.strip()).collect()

# Broadcast the stop words as a set for membership tests inside the filter
stopwords_broadcast = sc.broadcast(set(stopwords))

# Input corpus: the Shakespeare text stored in HDFS
text_file = sc.textFile("hdfs:///user/hadoop/shakespeare")


def is_counted(word):
    """Keep a token only if it is non-empty and not a stop word."""
    return bool(word) and word not in stopwords_broadcast.value


# Clean each line, tokenize on single spaces, drop unwanted tokens, then count
cleaned_words = text_file.flatMap(lambda line: clean_text(line).split(" "))
kept_words = cleaned_words.filter(is_counted)
counts = (
    kept_words.map(lambda word: (word, 1))
    .reduceByKey(add)
    .sortBy(lambda word_count: word_count[1], ascending=False)
)

# Optional: persist the full result to HDFS
# counts.saveAsTextFile("hdfs:///user/hadoop/shakespeare_result")

# Print the 30 most frequent remaining words
for word, count in counts.take(30):
    print(f"{word}: {count}")

spark.stop()

In [None]:
%%dockerexec hadoop

# Load SPARK_HOME and put spark-submit on the PATH
. /opt/envvars.sh

# Run from the directory that holds the job scripts
cd /opt/src/spark

# Local run; append `2> /dev/null` to hide Spark's log output
spark-submit --master local wordcount.py
# YARN alternative:
# spark-submit --master yarn wordcount.py


## WordCount using DataFrame API

In [None]:
%%dockerwrite hadoop /opt/src/spark/wordcount_df.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

def main():
    """Print the 30 most frequent words of the Shakespeare corpus.

    Reads the text from HDFS, splits every line on runs of whitespace,
    discards empty tokens, counts occurrences per word with the DataFrame
    API and shows the top 30 before stopping the session.
    """
    session = SparkSession.builder.appName("WordCountDataFrame").getOrCreate()

    # One row per line, in a single column named "value"
    lines = session.read.text("hdfs:///user/hadoop/shakespeare")

    # One output row per whitespace-separated token
    word_column = explode(split(col("value"), "\\s+")).alias("word")

    top_words = (
        lines.select(word_column)
        .filter(col("word") != "")      # splitting can yield empty strings
        .groupBy("word")
        .count()
        .sort(col("count").desc())
        .limit(30)
    )

    top_words.show(30)

    session.stop()


if __name__ == "__main__":
    main()


In [None]:
%%dockerexec hadoop

# Load SPARK_HOME and put spark-submit on the PATH
. /opt/envvars.sh

# Run from the directory that holds the job scripts
cd /opt/src/spark

# Local run of the DataFrame variant; append `2> /dev/null` to hide Spark's log output
spark-submit --master local wordcount_df.py
# YARN alternative:
# spark-submit --master yarn wordcount_df.py


## WordCount using SparkSQL

In [None]:
%%dockerwrite hadoop /opt/src/spark/wordcount_sql.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

def main():
    """Print the 30 most frequent words of the Shakespeare corpus via Spark SQL.

    Reads the text from HDFS, splits every line on runs of whitespace,
    registers the non-empty tokens as the temporary view ``words`` and runs
    a SQL aggregation over it, showing the top 30 before stopping the session.
    """
    session = SparkSession.builder.appName("WordCountSparkSQL").getOrCreate()

    # One output row per whitespace-separated token of each input line
    word_column = explode(split(col("value"), "\\s+")).alias("word")
    words = session.read.text("hdfs:///user/hadoop/shakespeare").select(word_column)

    # Expose only the non-empty tokens to SQL under the view name "words"
    words.filter(col("word") != "").createOrReplaceTempView("words")

    # Count, sort and limit entirely in SQL
    top_words = session.sql("""
        SELECT word, COUNT(*) as count
        FROM words
        GROUP BY word
        ORDER BY count DESC
        LIMIT 30
    """)

    top_words.show(30)

    session.stop()


if __name__ == "__main__":
    main()


In [None]:
%%dockerexec hadoop

# Load SPARK_HOME and put spark-submit on the PATH
. /opt/envvars.sh

# Run from the directory that holds the job scripts
cd /opt/src/spark

# Local run of the Spark SQL variant; append `2> /dev/null` to hide Spark's log output
spark-submit --master local wordcount_sql.py
# YARN alternative:
# spark-submit --master yarn wordcount_sql.py