In [1]:
import shutil

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Build the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("word-count")
    .getOrCreate()
)

## DataFrame

In [3]:
# Read the README as a DataFrame with one row per line of text.
textFile = spark.read.text(paths="spark.README.md")

In [4]:
# Confirm that the reader returned a DataFrame.
type(textFile)

pyspark.sql.dataframe.DataFrame

In [5]:
# Preview the first five lines without truncating long values.
textFile.select("value").show(n=5, truncate=False)

+------------------------------------------------------------+
|value                                                       |
+------------------------------------------------------------+
|[Databricks Sandbox](https://community.cloud.databricks.com)|
|                                                            |
|2019-07-30                                                  |
|                                                            |
|reinstall Anaconda                                          |
+------------------------------------------------------------+
only showing top 5 rows



In [6]:
# Keep only the lines that mention "spark", ignoring case.
linesWithSpark = textFile.filter(
    F.lower(F.col("value")).contains("spark")
)

In [7]:
# Preview the first five matching lines without truncating long values.
linesWithSpark.show(n=5, truncate=False)

+----------------------------------------------------+
|value                                               |
+----------------------------------------------------+
|Apache Spark                                        |
|http://spark.apache.org/                            |
|$ mkdir -p ~/spark                                  |
|$ cd spark                                          |
|$ tar zxvf ~/Downloads/spark-2.4.3-bin-hadoop2.7.tgz|
+----------------------------------------------------+
only showing top 5 rows



In [8]:
# Split every line on runs of whitespace, emit one row per token, count how
# often each token occurs, and sort by descending frequency.
# The pattern is a raw string so that `\s` is passed to the regex engine
# as written instead of relying on Python tolerating an invalid escape
# sequence (which emits a warning on recent Python versions).
# Empty-string tokens are not filtered out.
wordCounts = (
    textFile
    .select(F.explode(F.split(F.col("value"), r"\s+")).alias("word"))
    .groupBy("word")
    .count()
    .orderBy(F.desc("count"))
)

In [9]:
# Show the ten most frequent tokens.
wordCounts.show(n=10, truncate=False)

+-------+-----+
|word   |count|
+-------+-----+
|       |63   |
|$      |21   |
|>>>    |14   |
|pyspark|8    |
|```    |8    |
|#      |7    |
|python |7    |
|Spark  |6    |
|##     |6    |
|in     |6    |
+-------+-----+
only showing top 10 rows



In [10]:
# Number of distinct tokens (one row per token after the groupBy).
wordCounts.count()

235

In [24]:
# Number of partitions of the RDD underlying the word-count DataFrame.
wordCounts.rdd.getNumPartitions()

11

In [25]:
# Reduce to two partitions and write the counts as CSV, replacing any
# previous output at the target path.
(
    wordCounts
    .coalesce(2)
    .write
    .format("csv")
    .mode("overwrite")
    .save("/tmp/wc_df.csv")
)

### Spark SQL

In [11]:
# Register the DataFrame as a temporary view named "textFile" so it can be
# queried with spark.sql.
textFile.createOrReplaceTempView(name="textFile")

In [12]:
# Preview five rows of the temp view through the SQL interface.
(
    spark
    .sql("select * from textFile limit 5")
    .show()
)

+--------------------+
|               value|
+--------------------+
|[Databricks Sandb...|
|                    |
|          2019-07-30|
|                    |
|  reinstall Anaconda|
+--------------------+



In [13]:
# Word count expressed in SQL against the "textFile" temp view.
# Lines are split on a single space character here, whereas the DataFrame
# version splits on a whitespace regex, so the two cells tokenize differently.
# Only the ten most frequent tokens are returned.
sql_stmt = """
    with words as (
        select 
            explode(split(value, " ")) as word 
        from textFile
    )
    select 
        word, count(*) as count
    from words
    group by word
    order by count desc
    limit 10
"""

# Run the query and print the result table.
spark.sql(sql_stmt).show()

+-------+-----+
|   word|count|
+-------+-----+
|       |   85|
|      $|   21|
|    >>>|   14|
|pyspark|    8|
|    ```|    8|
|      #|    7|
| python|    7|
|     in|    6|
|   with|    6|
|     ##|    6|
+-------+-----+



### RDD

In [14]:
# Get the SparkContext behind the session; the RDD cells below use it.
sc = spark.sparkContext

In [15]:
# Load the same README through the RDD API.
f = sc.textFile("spark.README.md")

In [16]:
# Peek at the first five elements of the RDD.
f.take(5)

['[Databricks Sandbox](https://community.cloud.databricks.com)',
 '',
 '2019-07-30',
 '',
 'reinstall Anaconda']

In [17]:
# Classic RDD word count: split each line on single spaces, pair every
# token with 1, then sum the ones per token.
wc = (
    f.flatMap(lambda line: line.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [18]:
# Peek at five (token, count) pairs; reduceByKey output is not sorted.
wc.take(5)

[('[Databricks', 1),
 ('Sandbox](https://community.cloud.databricks.com)', 1),
 ('', 85),
 ('2019-07-30', 1),
 ('https://www.digitalocean.com/community/tutorials/how-to-install-anaconda-on-ubuntu-18-04-quickstart',
  2)]

In [19]:
# Ten most frequent tokens as (token, count) pairs.
# takeOrdered returns only the requested number of elements to the driver,
# instead of collecting the whole RDD and sorting it locally.
# Negating the count turns the ascending order into a descending one.
wc.takeOrdered(10, key=lambda pair: -pair[1])

[('', 85),
 ('$', 21),
 ('>>>', 14),
 ('```', 8),
 ('pyspark', 8),
 ('python', 7),
 ('#', 7),
 ('Spark', 6),
 ('in', 6),
 ('##', 6)]

In [22]:
# saveAsTextFile fails when the output directory already exists, so a
# second run of this cell would raise. Remove any previous output first so
# the cell can be re-run, mirroring the overwrite mode of the CSV write.
# NOTE(review): assumes /tmp is on the driver's local filesystem, as the
# `!ls /tmp/wc_*` cell in this notebook suggests; confirm before using
# this with a distributed filesystem.
shutil.rmtree("/tmp/wc_rdd.txt", ignore_errors=True)
wc.saveAsTextFile("/tmp/wc_rdd.txt")

In [26]:
# List what the DataFrame and RDD write cells produced under /tmp.
!ls /tmp/wc_*

/tmp/wc_df.csv:
part-00000-bad9b944-c0e4-4386-ab79-202cd626fad0-c000.csv  _SUCCESS
part-00001-bad9b944-c0e4-4386-ab79-202cd626fad0-c000.csv

/tmp/wc_rdd.txt:
part-00000  part-00001	_SUCCESS
