# Chapter 3: Submitting and scaling your first PySpark program

In [12]:
# Set up
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession: getOrCreate() hands back an existing
# session when there is one, otherwise it builds a new one with this app name.
spark = SparkSession.builder.appName(
    "Analyzing the vocabulary of Pride and Prejudice."
).getOrCreate()

# Keep the cell outputs readable by limiting Spark's logging to errors.
spark.sparkContext.setLogLevel("ERROR")

In [13]:
# Data frame setup
from pyspark.sql.functions import col, split, lower, explode, regexp_extract

# Load the text file; each row holds one line of the file in a column
# named `value`.
book = spark.read.text("data/Ch02/1342-0.txt")

# Split each line on single spaces, giving one array of tokens per row.
lines = book.select(split("value", " ").alias("line"))

# Flatten the arrays so that every token sits on its own row.
words = lines.select(explode("line").alias("word"))

# Normalise case so that "The" and "the" end up in the same group later.
words_lower = words.select(lower(col("word")).alias("word_lower"))

# Keep the first match of `[a-z]*` (group 0). Because `*` also matches the
# empty string, a token that starts with a non-letter character yields "".
word_norm = words_lower.select(
    regexp_extract("word_lower", "[a-z]*", 0).alias("word_normalized")
)

# Drop the empty strings left over from the extraction, then rename the
# column to match the name of the resulting data frame.
word_nonull = (
    word_norm
    .where(col("word_normalized") != "")
    .withColumnRenamed("word_normalized", "word_nonull")
)

## `Count`

- `GroupedData` allows you to perform an aggregate function on each group. 
- Use `groupby` to group records, passing the columns we want to group by. The returned value is a `GroupedData` object, not a `DataFrame`. Once you apply an aggregate function to it, such as `count()`, it returns a `DataFrame`.
    - Note that `groupby` and `groupBy` are the same thing.
- You can sort the output with `orderBy`.
    - Note that `orderBy` only exists as camel case.

In [16]:
# Grouping alone does not compute anything visible: it yields a GroupedData
# object, which is displayed here to show its type.
groups = word_nonull.groupBy("word_nonull")
display(groups)

# Applying count() turns the groups into a DataFrame with a `count` column;
# sort it so the most frequent words come first.
results = groups.count().orderBy(col("count").desc())
results.show()

<pyspark.sql.group.GroupedData at 0x11b77b910>

+-----------+-----+
|word_nonull|count|
+-----------+-----+
|        the| 4480|
|         to| 4218|
|         of| 3711|
|        and| 3504|
|        her| 2199|
|          a| 1982|
|         in| 1909|
|        was| 1838|
|          i| 1750|
|        she| 1668|
|       that| 1487|
|         it| 1482|
|        not| 1427|
|        you| 1301|
|         he| 1296|
|         be| 1257|
|        his| 1247|
|         as| 1174|
|        had| 1170|
|       with| 1092|
+-----------+-----+
only showing top 20 rows

