In [1]:
from pyspark import SparkFiles
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SparkSession

In [2]:
# Get the Spark session for this notebook (an already-running one is reused)
spark = (
    SparkSession.builder
    .appName('stopwords')
    .getOrCreate()
)

In [3]:
# Load the food-review CSV from S3 into a DataFrame.
# addFile registers the URL so Spark downloads it for this job; SparkFiles.get
# then resolves the local path of that download from its file name.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_2/food_reviews.csv"
# Derive the file name from the URL so the two cannot drift apart.
file_name = url.rsplit("/", 1)[-1]
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get(file_name), sep=",", header=True)
df.show()

+--------------------+
|             Reviews|
+--------------------+
|The pasta was a d...|
|We ate the fish i...|
|My family did not...|
|The girl even tri...|
|this is his job a...|
|I'm always greete...|
+--------------------+



In [4]:
# Tokenizer that reads the 'Reviews' column and writes its tokens to 'words'
tokenizer = Tokenizer(inputCol='Reviews', outputCol='words')
# Apply it to the loaded reviews; the result keeps 'Reviews' and adds 'words'
tk_df = tokenizer.transform(df)

In [6]:
# Preview the tokenized reviews (show()'s defaults, spelled out)
tk_df.show(n=20, truncate=True)

+--------------------+--------------------+
|             Reviews|               words|
+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|
|We ate the fish i...|[we, ate, the, fi...|
|My family did not...|[my, family, did,...|
|The girl even tri...|[the, girl, even,...|
|this is his job a...|[this, is, his, j...|
|I'm always greete...|[i'm, always, gre...|
+--------------------+--------------------+



In [8]:
# Stop-word filter: reads the token lists in 'words', writes the filtered lists
# to 'removed'. No custom word list is passed, so the remover's default applies.
stopword_remover = StopWordsRemover(inputCol='words', outputCol='removed')
# Apply the filter to the tokenized DataFrame
remove_df = stopword_remover.transform(tk_df)

In [9]:
# Display the reviews alongside their tokens and the stop-word-filtered tokens
# (show()'s defaults, spelled out)
remove_df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+
|             Reviews|               words|             removed|
+--------------------+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|       [pasta, dish]|
|We ate the fish i...|[we, ate, the, fi...|  [ate, fish, tasty]|
|My family did not...|[my, family, did,...|[family, like, food]|
|The girl even tri...|[the, girl, even,...|[girl, even, trie...|
|this is his job a...|[this, is, his, j...|[job, since, prob...|
|I'm always greete...|[i'm, always, gre...|[always, greeted,...|
+--------------------+--------------------+--------------------+



In [11]:
# Stop the Spark session now that the notebook no longer needs it
spark.stop()