In [1]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:5 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:6 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Ign:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:11 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic/main Sources [1,819 kB]
Ign:12 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:13 https://developer.download.nvidia.co

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("StopWords")
spark = session_builder.getOrCreate()


In [3]:
# Build a DataFrame whose "raw" column already holds lists of words,
# so the tokenization step can be skipped for now.
pretokenized_rows = [
    (0, ["Big", "data", "is", "super", "powerful"]),
    (1, ["This", "is", "going", "to", "be", "epic"]),
]
sentenceData = spark.createDataFrame(pretokenized_rows, ["id", "raw"])
sentenceData.show(truncate=False)

+---+--------------------------------+
|id |raw                             |
+---+--------------------------------+
|0  |[Big, data, is, super, powerful]|
|1  |[This, is, going, to, be, epic] |
+---+--------------------------------+



In [4]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover


In [5]:
# Configure the stop-word remover:
# it reads word lists from "raw" and writes the result to "filtered".
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)


In [6]:
# Apply the remover to the pre-tokenized data and display the untruncated rows
filtered_sentences = remover.transform(sentenceData)
filtered_sentences.show(truncate=False)

+---+--------------------------------+----------------------------+
|id |raw                             |filtered                    |
+---+--------------------------------+----------------------------+
|0  |[Big, data, is, super, powerful]|[Big, data, super, powerful]|
|1  |[This, is, going, to, be, epic] |[going, epic]               |
+---+--------------------------------+----------------------------+



In [7]:
# Combine tokenization with stop-word removal, starting from raw sentences
from pyspark.ml.feature import Tokenizer

# Sample DataFrame of untokenized sentences
sample_sentences = [
    (0, "Spark is great"),
    (1, "We are learning spark"),
    (2, "Spark is better than Hadoop no doubt"),
]
df = spark.createDataFrame(sample_sentences, ["id", "sentence"])
df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1|We are learning s...|
|  2|Spark is better t...|
+---+--------------------+



In [8]:
# Tokenizer that reads the "sentence" column and writes its tokens to "words"
tok = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
# Bare expression so the notebook displays the transformer's repr
tok

Tokenizer_52e8ba72cc02

In [10]:
# Tokenize every sentence; the original columns are kept and "words" is added
tok_df = tok.transform(df)
# n=20 is the default row limit, spelled out here; truncate=False prints full cell contents
tok_df.show(n=20, truncate=False)

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |
|1  |We are learning spark               |[we, are, learning, spark]                  |
|2  |Spark is better than Hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+



In [13]:
# Import the stop-word remover
from pyspark.ml.feature import StopWordsRemover

# Remover for the tokenized sentences: reads "words", writes "filtered".
# NOTE(review): this rebinds `remover`, which an earlier cell configured for the
# "raw" column; re-running that earlier transform after this cell would pick up
# this configuration instead.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Apply to the tokenized DataFrame and display the untruncated rows
words_filtered = remover.transform(tok_df)
words_filtered.show(truncate=False)

+---+------------------------------------+--------------------------------------------+------------------------------+
|id |sentence                            |words                                       |filtered                      |
+---+------------------------------------+--------------------------------------------+------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |[spark, great]                |
|1  |We are learning spark               |[we, are, learning, spark]                  |[learning, spark]             |
|2  |Spark is better than Hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|[spark, better, hadoop, doubt]|
+---+------------------------------------+--------------------------------------------+------------------------------+

