In [1]:
# Ref: https://spark.apache.org/docs/latest/quick-start.html
# Read the README through Spark's text reader; the result is a DataFrame.
readme_path = "README.md"
textFile = spark.read.text(readme_path)

In [2]:
# Print the DataFrame's textual representation (its schema summary).
print(textFile)

DataFrame[value: string]


In [3]:
# Number of rows in this DataFrame; the bare name on the last line is the cell's displayed result.
row_count = textFile.count()
row_count

89

In [4]:
# First row in this DataFrame, shown as the cell's result.
first_row = textFile.first()
first_row

Row(value='# PySpark In Jupyter')

In [5]:
# Derive a new DataFrame holding only the rows whose "value" column contains "Spark".
mentions_spark = textFile["value"].contains("Spark")
linesWithSpark = textFile.filter(mentions_spark)
print(linesWithSpark)

DataFrame[value: string]


In [6]:
# Run the count() action on the filtered DataFrame: how many lines contain "Spark"?
spark_line_count = linesWithSpark.count()
print(spark_line_count)

7


In [8]:
# Map each line to its word count (aliased as "numWords"), which creates a new DataFrame,
# then call agg on that DataFrame to find the largest word count.
# The arguments to select and agg are both Columns; df.colName gets a Column from a DataFrame.
# pyspark.sql.functions provides many convenient functions for building a new Column from an old one.

# NOTE: this wildcard import shadows Python's builtin `max` with the Spark column
# aggregate of the same name; the `max(col(...))` call below relies on that.
from pyspark.sql.functions import *
# The pattern is a raw string: "\s" is not a valid Python escape sequence, so the
# non-raw form triggers an invalid-escape warning even though the value is the same.
textFile.select(size(split(textFile.value, r"\s+")).name("numWords")).agg(max(col("numWords"))).collect()

[Row(max(numWords)=40)]

In [10]:
# Example: MapReduce. Spark can implement MapReduce flows easily.
# explode (inside select) transforms the DataFrame of lines into a DataFrame of words;
# groupBy and count are then combined to compute the per-word counts in the file
# as a DataFrame of 2 columns: "word" and "count".
# The split pattern is a raw string because "\s" is not a valid Python escape sequence.
wordCounts = (
    textFile
    .select(explode(split(textFile.value, r"\s+")).alias("word"))
    .groupBy("word")
    .count()
)

# To collect the word counts in our shell, we can call collect
wordCounts.collect()

[Row(word='`$', count=3),
 Row(word='version,', count=1),
 Row(word='/usr/local/Cellar/spark/x.y.z', count=1),
 Row(word='port', count=2),
 Row(word='[Jupyter', count=1),
 Row(word='local', count=2),
 Row(word='Copy', count=1),
 Row(word='server', count=1),
 Row(word='down', count=1),
 Row(word='Jupyter](https://towardsdatascience.com/how-to-use-pyspark-on-your-computer-9c7180075617)', count=1),
 Row(word='done,', count=1),
 Row(word='8888', count=1),
 Row(word="PYSPARK_DRIVER_PYTHON_OPTS=notebook'", count=1),
 Row(word='1.8.0.242', count=1),
 Row(word='will', count=1),
 Row(word='particular', count=1),
 Row(word='container', count=1),
 Row(word='process', count=1),
 Row(word='Jypyter', count=1),
 Row(word='using', count=1),
 Row(word='1.8.0.242,', count=1),
 Row(word="'export", count=6),
 Row(word='variable', count=1),
 Row(word='ln', count=1),
 Row(word='In', count=1),
 Row(word='[jEnv](https://github.com/jenv/jenv)', count=1),
 Row(word='>>', count=7),
 Row(word='Docker', count=1),
