In [None]:
# Download Java Virtual Machine (JVM)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
# Unzip the file
!tar xf spark-3.3.1-bin-hadoop3.tgz

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = '/content/spark-3.3.1-bin-hadoop3'

# Install library for finding Spark
!pip install -q findspark
# Import the libary
import findspark
# Initiate findspark
findspark.init()
# Check the location for Spark
findspark.find()

'/content/spark-3.3.1-bin-hadoop3'

In [None]:
# Start (or reuse) the Spark session
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session when there is one,
# so re-running this cell does not spin up a second session.
spark = (
    SparkSession.builder
    .appName("StopWords")
    .getOrCreate()
)

In [None]:
# Build a small DataFrame of pre-tokenized sentences:
# "id" is a row number, "raw" holds the list of tokens.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["Big", "data", "is", "super", "powerful"]),
        (1, ["This", "is", "going", "to", "be", "epic"]),
    ],
    schema=["id", "raw"],
)
# truncate=False prints the full token lists instead of cutting them off
sentenceData.show(truncate=False)

+---+--------------------------------+
|id |raw                             |
+---+--------------------------------+
|0  |[Big, data, is, super, powerful]|
|1  |[This, is, going, to, be, epic] |
+---+--------------------------------+



In [None]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Configure the stop-word remover: it reads token lists from "raw" and
# writes the filtered lists to a new "filtered" column. Nothing is
# processed until transform() is called.
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [None]:
# Apply the remover to the tokenized sentences and print the result,
# showing the original and filtered token lists side by side.
(
    remover
    .transform(sentenceData)
    .show(truncate=False)
)

+---+--------------------------------+----------------------------+
|id |raw                             |filtered                    |
+---+--------------------------------+----------------------------+
|0  |[Big, data, is, super, powerful]|[Big, data, super, powerful]|
|1  |[This, is, going, to, be, epic] |[going, epic]               |
+---+--------------------------------+----------------------------+



In [None]:
# SKILL DRILL
# Start from a DataFrame whose sentences are NOT yet tokenized.

# Sample data: "id" is a row number, "sentence" is the raw text
dataframe = spark.createDataFrame(
    data=[
        (0, "Spark is great"),
        (1, "We are learning Spark"),
        (2, "Spark is better than hadoop no doub"),
    ],
    schema=["id", "sentence"],
)
# truncate=False prints each sentence in full
dataframe.show(truncate=False)

+---+-----------------------------------+
|id |sentence                           |
+---+-----------------------------------+
|0  |Spark is great                     |
|1  |We are learning Spark              |
|2  |Spark is better than hadoop no doub|
+---+-----------------------------------+



In [None]:
# TOKENIZATION
# ------------

# Import the tokenizer
from pyspark.ml.feature import Tokenizer

# Configure a tokenizer that reads "sentence" and writes the
# resulting token list to a new "words" column
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

# Tokenize the sentences; keep the result because the stop-word
# removal step below consumes the "words" column
tokenized_df = tokenizer.transform(dataframe)
tokenized_df.show(truncate=False)

+---+-----------------------------------+-------------------------------------------+
|id |sentence                           |words                                      |
+---+-----------------------------------+-------------------------------------------+
|0  |Spark is great                     |[spark, is, great]                         |
|1  |We are learning Spark              |[we, are, learning, spark]                 |
|2  |Spark is better than hadoop no doub|[spark, is, better, than, hadoop, no, doub]|
+---+-----------------------------------+-------------------------------------------+



In [None]:
# REMOVING STOP WORDS
# -------------------

# Configure a remover for the tokenized data: it reads the "words"
# column produced by the tokenizer and writes to "filtered".
# Note: this rebinds `remover`, replacing the one configured for "raw".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Apply it to the tokenized DataFrame and print every column in full
(
    remover
    .transform(tokenized_df)
    .show(truncate=False)
)


+---+-----------------------------------+-------------------------------------------+-----------------------------+
|id |sentence                           |words                                      |filtered                     |
+---+-----------------------------------+-------------------------------------------+-----------------------------+
|0  |Spark is great                     |[spark, is, great]                         |[spark, great]               |
|1  |We are learning Spark              |[we, are, learning, spark]                 |[learning, spark]            |
|2  |Spark is better than hadoop no doub|[spark, is, better, than, hadoop, no, doub]|[spark, better, hadoop, doub]|
+---+-----------------------------------+-------------------------------------------+-----------------------------+

