In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
spark = SparkSession\
    .builder\
    .appName("word-count")\
    .getOrCreate()

In [3]:
spark

## 3 approaches to word-count problems

Be careful with the choice of `split` char, we got different answers with `\s+`, using single space char gives the same answer among 3 approaches

### Dataframe

In [4]:
linesDF = spark.read.text("spark.README.md")

In [5]:
type(linesDF)

pyspark.sql.dataframe.DataFrame

In [6]:
linesDF.select("value").show(5, False)

+------------------------------------------------------------+
|value                                                       |
+------------------------------------------------------------+
|[Databricks Sandbox](https://community.cloud.databricks.com)|
|                                                            |
|2019-07-30                                                  |
|                                                            |
|reinstall Anaconda                                          |
+------------------------------------------------------------+
only showing top 5 rows



In [7]:
wordCounts = (
    linesDF
    .select(F.explode(F.split(F.col("value"), " ")).alias("word"))
    .groupBy("word").count()
    .orderBy(F.desc("count"))
)

In [8]:
wordCounts.show(10, truncate=False)

+-------+-----+
|word   |count|
+-------+-----+
|       |85   |
|$      |21   |
|>>>    |14   |
|```    |8    |
|pyspark|8    |
|#      |7    |
|python |7    |
|##     |6    |
|with   |6    |
|in     |6    |
+-------+-----+
only showing top 10 rows



In [9]:
wordCounts.count()

235

In [10]:
wordCounts.rdd.getNumPartitions()

11

In [11]:
# repartition to 2
wordCounts.coalesce(2).write.format("csv").mode("overwrite").save("/tmp/wc_df.csv")

#### filter

In [12]:
linesWithSpark = linesDF.filter(F.lower(F.col("value")).contains("spark"))

linesWithSpark.show(5, False)

+----------------------------------------------------+
|value                                               |
+----------------------------------------------------+
|Apache Spark                                        |
|http://spark.apache.org/                            |
|$ mkdir -p ~/spark                                  |
|$ cd spark                                          |
|$ tar zxvf ~/Downloads/spark-2.4.3-bin-hadoop2.7.tgz|
+----------------------------------------------------+
only showing top 5 rows



### spark SQL

In [13]:
# create a table named "linesTAB"
linesDF.createOrReplaceTempView("linesTAB")   

In [14]:
spark.sql("select * from linesTAB limit 5").show(truncate=False)

+------------------------------------------------------------+
|value                                                       |
+------------------------------------------------------------+
|[Databricks Sandbox](https://community.cloud.databricks.com)|
|                                                            |
|2019-07-30                                                  |
|                                                            |
|reinstall Anaconda                                          |
+------------------------------------------------------------+



In [15]:
sql_stmt = """
    with words as (
        select 
            explode(split(value, " ")) as word 
        from linesTAB
    )
    select 
        word, 
        count(*) as count
    from words
    group by word
    order by count desc
    limit 10
"""

spark.sql(sql_stmt).show()

+-------+-----+
|   word|count|
+-------+-----+
|       |   85|
|      $|   21|
|    >>>|   14|
|pyspark|    8|
|    ```|    8|
|      #|    7|
| python|    7|
|     in|    6|
|   with|    6|
|     ##|    6|
+-------+-----+



#### filter line with `spark` word

In [16]:
sql_stmt = """
    select 
        value
    from linesTAB
    where 
    --lower(value) like '%spark%'
    instr(lower(value),'spark') > 0
"""

spark.sql(sql_stmt).show(5, False)

+----------------------------------------------------+
|value                                               |
+----------------------------------------------------+
|Apache Spark                                        |
|http://spark.apache.org/                            |
|$ mkdir -p ~/spark                                  |
|$ cd spark                                          |
|$ tar zxvf ~/Downloads/spark-2.4.3-bin-hadoop2.7.tgz|
+----------------------------------------------------+
only showing top 5 rows



### RDD

In [17]:
sc = spark.sparkContext

In [18]:
linesRDD = sc.textFile("spark.README.md")

In [19]:
type(linesRDD)

pyspark.rdd.RDD

In [20]:
linesRDD.take(5)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 19.0 failed 1 times, most recent failure: Lost task 0.0 in stage 19.0 (TID 1213, 192.168.0.114, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 477, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 3.7 than that in driver 3.8, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:154)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:154)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 477, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 3.7 than that in driver 3.8, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:154)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
wc = (
    linesRDD.flatMap(lambda x: x.split(" "))
    .map(lambda x: (x, 1))
    .reduceByKey(lambda a,b: a+b)
)

In [None]:
wc.take(5)

In [None]:
sorted(wc.collect(), key = lambda x: x[1], reverse=True)[:10]

rdd.save() does not support `overwrite` mode (see https://community.cloudera.com/t5/Support-Questions/Apache-SPARK-Overwrite-data-file/m-p/105253), one must remove it before hand.

In [21]:
!rm -rf /tmp/wc_rdd.txt

In [22]:
wc.saveAsTextFile("/tmp/wc_rdd.txt")

NameError: name 'wc' is not defined

In [23]:
!ls /tmp/wc_*

part-00000-e2e92187-f129-4e00-9c13-086f1eb7f05c-c000.csv  _SUCCESS
part-00001-e2e92187-f129-4e00-9c13-086f1eb7f05c-c000.csv


In [24]:
spark.stop()