In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [4]:
# Obtain the SparkSession used by every cell below (created if none exists yet).
spark = (
    SparkSession.builder
    .appName("word-count")
    .getOrCreate()
)

## DataFrame

In [5]:
# Read the file as a DataFrame; per the preview output further down it has a
# single string column named "value" with one row per line of the file.
textFile = spark.read.text("spark.README.md")

In [6]:
# Confirm that spark.read.text returned a DataFrame.
type(textFile)

pyspark.sql.dataframe.DataFrame

In [10]:
# Preview five lines of the file without truncating long values.
textFile.select("value").show(n=5, truncate=False)

+------------------------------------------------------------+
|value                                                       |
+------------------------------------------------------------+
|[Databricks Sandbox](https://community.cloud.databricks.com)|
|                                                            |
|2019-07-30                                                  |
|                                                            |
|reinstall Anaconda                                          |
+------------------------------------------------------------+
only showing top 5 rows



In [12]:
# Keep only lines that mention "spark"; lowercasing first makes the match
# case-insensitive.
linesWithSpark = textFile.where(
    F.lower(F.col("value")).contains("spark")
)

In [13]:
# Display up to five of the matching lines at full width.
linesWithSpark.show(n=5, truncate=False)

+----------------------------------------------------+
|value                                               |
+----------------------------------------------------+
|Apache Spark                                        |
|http://spark.apache.org/                            |
|$ mkdir -p ~/spark                                  |
|$ cd spark                                          |
|$ tar zxvf ~/Downloads/spark-2.4.3-bin-hadoop2.7.tgz|
+----------------------------------------------------+
only showing top 5 rows



In [23]:
# Tokenize each line on runs of whitespace, emit one row per token, then count
# occurrences of each token and order from most to least frequent.
# The pattern is a raw string so that "\s" reaches the regex engine verbatim
# instead of being an invalid Python escape sequence.
# NOTE: empty-string tokens (e.g. from blank lines) are counted as a "word";
# they show up as the first row of the output below.
wordCounts = (
    textFile
    .select(F.explode(F.split(F.col("value"), r"\s+")).alias("word"))
    .groupBy("word").count()
    .orderBy(F.desc("count"))
)

In [44]:
# Ten most frequent tokens, shown without truncation.
wordCounts.show(n=10, truncate=False)

+-------+-----+
|word   |count|
+-------+-----+
|       |63   |
|$      |21   |
|>>>    |14   |
|```    |8    |
|pyspark|8    |
|#      |7    |
|python |7    |
|Spark  |6    |
|with   |6    |
|in     |6    |
+-------+-----+
only showing top 10 rows



In [18]:
# Number of rows in the grouped result, i.e. the number of distinct tokens.
wordCounts.count()

235

### Spark SQL

In [47]:
textFile.createOrReplaceTempView("textFile")   # register the DataFrame as a temporary view named "textFile" so it can be queried with spark.sql

In [48]:
# Sanity-check the temp view by fetching five rows through SQL.
spark.sql("select * from textFile limit 5").show()

+--------------------+
|               value|
+--------------------+
|[Databricks Sandb...|
|                    |
|          2019-07-30|
|                    |
|  reinstall Anaconda|
+--------------------+



In [49]:
# Word count expressed in Spark SQL against the "textFile" temp view: explode
# each line into tokens, count per token, and keep the ten most frequent.
# NOTE(review): this splits on a single space (" "), whereas the DataFrame
# version splits on the regex \s+, so the empty-token count differs between
# the two outputs (85 here vs 63 there).
sql_stmt = """
    with words as (
        select 
            explode(split(value, " ")) as word 
        from textFile
    )
    select 
        word, count(*) as count
    from words
    group by word
    order by count desc
    limit 10
"""

spark.sql(sql_stmt).show()

+-------+-----+
|   word|count|
+-------+-----+
|       |   85|
|      $|   21|
|    >>>|   14|
|pyspark|    8|
|    ```|    8|
|      #|    7|
| python|    7|
|     in|    6|
|   with|    6|
|     ##|    6|
+-------+-----+



### RDD

In [28]:
# SparkContext behind the session; it is the entry point for the RDD calls below.
sc = spark.sparkContext

In [29]:
# Load the same file through the RDD API; per the take(5) output below each
# element is one line of the file as a plain string.
f = sc.textFile("spark.README.md")

In [31]:
# Peek at the first five elements of the RDD.
f.take(5)

['[Databricks Sandbox](https://community.cloud.databricks.com)',
 '',
 '2019-07-30',
 '',
 'reinstall Anaconda']

In [32]:
# Word count with the RDD API: split every line on single spaces, pair each
# token with 1, then sum the ones per token.
wc = (
    f.flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [33]:
# Sample five (word, count) pairs; no ordering has been applied at this point.
wc.take(5)

[('[Databricks', 1),
 ('Sandbox](https://community.cloud.databricks.com)', 1),
 ('', 85),
 ('2019-07-30', 1),
 ('https://www.digitalocean.com/community/tutorials/how-to-install-anaconda-on-ubuntu-18-04-quickstart',
  2)]

In [46]:
# Ten (word, count) pairs with the highest counts, in descending order.
# takeOrdered lets Spark pick the top entries without first collecting every
# pair to the driver just to sort and slice them there.
wc.takeOrdered(10, key=lambda pair: -pair[1])

[('', 85),
 ('$', 21),
 ('>>>', 14),
 ('```', 8),
 ('pyspark', 8),
 ('python', 7),
 ('#', 7),
 ('Spark', 6),
 ('in', 6),
 ('##', 6)]