In [1]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("example") \
    .getOrCreate()

# Loading a dataframe from a parquet file

A dataframe file called sherlock_sentences.parquet is available in your workspace. Each row of this dataframe contains a single clause. Each clause is a sequence of words that is separated from other clauses by punctuation, such as periods, quotes, and other natural language delimiters that signify a sentence or sentence fragment. Your mission, if you choose to accept it, is to load this file.

In [2]:
# Load the dataframe
df = spark.read.parquet('dataset/sherlock.parquet')

# Filter and show the first 5 rows
df.where('id > 70').show(5, truncate=False)

+------+---+
|word  |id |
+------+---+
|it    |71 |
|do    |72 |
|not   |73 |
|change|74 |
|or    |75 |
+------+---+
only showing top 5 rows



# Split and explode a text column

Each clause is a string containing one or more words separated by spaces.

In [3]:
df = spark.read.csv('dataset/sherlock.txt', header=False)
df.show(5, truncate=False)

+--------------------------------------------------------------------+
|_c0                                                                 |
+--------------------------------------------------------------------+
|The Project Gutenberg EBook of The Adventures of Sherlock Holmes    |
|by Sir Arthur Conan Doyle                                           |
|(#15 in our series by Sir Arthur Conan Doyle)                       |
|Copyright laws are changing all over the world. Be sure to check the|
|copyright laws for your country before downloading or redistributing|
+--------------------------------------------------------------------+
only showing top 5 rows



In [4]:
from pyspark.sql.functions import split, explode

# Split the clause column into a column called words 
punctuation = "_|.\?\!\",\'\[\]\*()"
split_df = df.select(split('_c0', '[ %s]' % punctuation).alias('words'))
split_df.show(5, truncate=False)


+-----------------------------------------------------------------------------------+
|words                                                                              |
+-----------------------------------------------------------------------------------+
|[The, Project, Gutenberg, EBook, of, The, Adventures, of, Sherlock, Holmes]        |
|[by, Sir, Arthur, Conan, Doyle]                                                    |
|[, #15, in, our, series, by, Sir, Arthur, Conan, Doyle, ]                          |
|[Copyright, laws, are, changing, all, over, the, world, , Be, sure, to, check, the]|
|[copyright, laws, for, your, country, before, downloading, or, redistributing]     |
+-----------------------------------------------------------------------------------+
only showing top 5 rows



In [5]:

# Explode the words column into a column called word 
exploded_df = split_df.select(explode('words').alias('word'))
exploded_df.show(10)

# Count the resulting number of rows in exploded_df
print("\nNumber of rows: ", exploded_df.count())

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|       The|
|Adventures|
|        of|
|  Sherlock|
|    Holmes|
+----------+
only showing top 10 rows


Number of rows:  939694


In [6]:

# Explode the words column into a column called word 
exploded_df = exploded_df.filter("LEN(word) > 0")

# Count the resulting number of rows in exploded_df
print("\nNumber of rows: ", exploded_df.count())


Number of rows:  807668


# Using monotonically_increasing_id()

monotonically_increasing_id is efficient at generating a column of integers that is always increasing. It is useful for creating a unique id per row. Which of the following sequences would not be generated by the `monotonically_increasing_id` operation?

- 1, 2, 3, 3, 4, 5, 5, 6

# Creating context window feature data

The moving window technique is useful for machine learning algorithms models that use context window feature data.

A table text having columns id, word, part, title is available in your workspace. It contains chapters 9, 10, 11 and 12 of the Sherlock Holmes book. The words are already processed and organized into one word per row. Each word has a unique integer index provided by the column id. The id column is lower for words that appear earlier in the text and greater for words appearing later in the text.

The first 10 rows of the dataset for chapter 12 are printed to the console as Table1. The first ten rows of the desired result, constrained to show part 12 (Chapter 12) are printed to the console as Table2. In Table2, the "given" word for the row is provided in column w3. Columns w1 and w2 give the two words immediately prior to the given word. Columns w4 and w5 give the two words immediately after the given word.

In [7]:
from pyspark.sql.functions import monotonically_increasing_id
exploded_df = exploded_df.withColumn("id", monotonically_increasing_id())
exploded_df.createOrReplaceTempView("text")
query = """
SELECT  *
FROM text
"""
spark.sql(query).show(10)
print(exploded_df.count())


+----------+---+
|      word| id|
+----------+---+
|       The|  0|
|   Project|  1|
| Gutenberg|  2|
|     EBook|  3|
|        of|  4|
|       The|  5|
|Adventures|  6|
|        of|  7|
|  Sherlock|  8|
|    Holmes|  9|
+----------+---+
only showing top 10 rows

807668


In [8]:
from pyspark.sql import functions as F

# Calculate total count of rows
total_count = exploded_df.count()

# Calculate number of rows per part
rows_per_part = total_count // 12

# Calculate part number for each row
df_with_part = exploded_df.withColumn("part", F.expr(f"ntile(12) over (order by id)"))

# Show DataFrame with part column
df_with_part.show(10)
df_with_part.createOrReplaceTempView("text")


+----------+---+----+
|      word| id|part|
+----------+---+----+
|       The|  0|   1|
|   Project|  1|   1|
| Gutenberg|  2|   1|
|     EBook|  3|   1|
|        of|  4|   1|
|       The|  5|   1|
|Adventures|  6|   1|
|        of|  7|   1|
|  Sherlock|  8|   1|
|    Holmes|  9|   1|
+----------+---+----+
only showing top 10 rows



In [9]:
query = """
SELECT
part,
LAG(word, 2) OVER(PARTITION BY part ORDER BY id) AS w1,
LAG(word, 1) OVER(PARTITION BY part ORDER BY id) AS w2,
word AS w3,
LEAD(word, 1) OVER(PARTITION BY part ORDER BY id) AS w4,
LEAD(word, 2) OVER(PARTITION BY part ORDER BY id) AS w5
FROM text
"""
text_df = spark.sql(query).where("part = 12")
text_df.show(10)


+----+-------+-------+-------+-------+-------+
|part|     w1|     w2|     w3|     w4|     w5|
+----+-------+-------+-------+-------+-------+
|  12|   NULL|   NULL|  Petya|     Go|     go|
|  12|   NULL|  Petya|     Go|     go|    she|
|  12|  Petya|     Go|     go|    she|     is|
|  12|     Go|     go|    she|     is|calling|
|  12|     go|    she|     is|calling|    and|
|  12|    she|     is|calling|    and|weeping|
|  12|     is|calling|    and|weeping|   like|
|  12|calling|    and|weeping|   like|      a|
|  12|    and|weeping|   like|      a|  child|
|  12|weeping|   like|      a|  child|    and|
+----+-------+-------+-------+-------+-------+
only showing top 10 rows



# Repartitioning the data

The dataframe text_df is currently in a single partition. Suppose that you know that the upcoming processing steps are going to be grouping the data on chapters. Processing the data will be most efficient if each chapter stays within a single machine. To avoid unnecessary shuffling of the data from one machine to another, let's repartition the dataframe into one partition per chapter, using the repartition and getNumPartitions commands taught in the first video lesson to this chapter.

In [10]:
# Repartition text_df into 12 partitions on 'chapter' column
repart_df = text_df.repartition(12, 'part')

# Prove that repart_df has 12 partitions
repart_df.rdd.getNumPartitions()

12

# What type of data is this

Words, song ids, and video ids are all examples of what type of data?

- Categorical data

# Finding common word sequences

 Our objective is to create a dataset where each row corresponds to a 5-tuple, having a count indicating how many times the tuple occurred in the dataset.

In [11]:
query = """
SELECT *
FROM text
"""
ttext_df = spark.sql(query)
ttext_df.show(10)


+----------+---+----+
|      word| id|part|
+----------+---+----+
|       The|  0|   1|
|   Project|  1|   1|
| Gutenberg|  2|   1|
|     EBook|  3|   1|
|        of|  4|   1|
|       The|  5|   1|
|Adventures|  6|   1|
|        of|  7|   1|
|  Sherlock|  8|   1|
|    Holmes|  9|   1|
+----------+---+----+
only showing top 10 rows



In [12]:
# Find the top 10 sequences of five words
query = """
SELECT w1, w2, w3, w4, w5, COUNT(*) AS count FROM (
   SELECT word AS w1,
   LEAD(word,1) OVER(PARTITION BY part ORDER BY id ) AS w2,
   LEAD(word,2) OVER(PARTITION BY part ORDER BY id ) AS w3,
   LEAD(word,3) OVER(PARTITION BY part ORDER BY id ) AS w4,
   LEAD(word,4) OVER(PARTITION BY part ORDER BY id ) AS w5
   FROM text
)
GROUP BY w1, w2, w3, w4, w5
ORDER BY count DESC
LIMIT 10
"""
df = spark.sql(query)
df.show()

+-------------+---------+---------+---------+----------+-----+
|           w1|       w2|       w3|       w4|        w5|count|
+-------------+---------+---------+---------+----------+-----+
|      Project|Gutenberg| Literary|  Archive|Foundation|   31|
|          the|    other|     side|       of|       the|   24|
|           in|      the|   region|       of|       the|   24|
|           on|      the|     same|    lines|        as|   21|
|Illustration:|     From|       an|      old|     print|   18|
|           in|      the|   middle|       of|       the|   18|
|          the|  Project|Gutenberg| Literary|   Archive|   18|
|    Copyright|       by|Underwood|      and| Underwood|   17|
|Illustration:|Copyright|       by|Underwood|       and|   17|
|           up|      and|     down|      the|      room|   17|
+-------------+---------+---------+---------+----------+-----+



# Unique 5-tuples in sorted order

A previous lesson taught an operation that eliminates duplicates, fetching unique records. In a previous exercise you obtained common 5-tuples. We will combine these two capabilities to find the unique 5-tuples, sorted alphabetically in descending order.

In [13]:
# Unique 5-tuples sorted in descending order
query = """
SELECT DISTINCT w1, w2, w3, w4, w5 FROM (
   SELECT word AS w1,
   LEAD(word, 1) OVER(PARTITION BY PART ORDER BY id ) AS w2,
   LEAD(word, 2) OVER(PARTITION BY PART ORDER BY id ) AS w3,
   LEAD(word, 3) OVER(PARTITION BY PART ORDER BY id ) AS w4,
   LEAD(word, 4) OVER(PARTITION BY PART ORDER BY id ) AS w5
   FROM text
)
ORDER BY w1 DESC, w2 DESC, w3 DESC, w4 DESC, w5 DESC 
LIMIT 10
"""
df = spark.sql(query)
df.show()

+---------+----------+-----------+----------+------------+
|       w1|        w2|         w3|        w4|          w5|
+---------+----------+-----------+----------+------------+
|        ~|        be|       used|        to|      convey|
|zygomatic|       and|    frontal|     bones|       vault|
|   zygoma|        in|      front|        of|         the|
|       zu|      sein|       Vera|        at|         the|
|  zoology|       was|        not|    merely|acknowledged|
|  zoology|   observe|       only|       the|    muscular|
|zone--not|       the|        red|margin--an|  artificial|
|     zone|     which|       lies|     about|        half|
|     zone|     which|corresponds|        to|         the|
|     zone|separating|        the|    shadow|          of|
+---------+----------+-----------+----------+------------+



# Most frequent 3-tuples per chapter

We will now use a query as a subquery in a larger query. Spark SQL supports advanced features of SQL. Previously you learned how to find the most common word sequences over an entire book having 12 chapters. Now you will obtain the most frequent 3-tuple for each of the 12 chapters. You will do this using a window function to retrieve the top row per group.

In [14]:
df = spark.table("text")
df = df.withColumn("chapter", df["part"])
df = df.select(df.id, df.word, df.part, df.chapter)
df.createOrReplaceTempView("text")
df.show(5)

+---+---------+----+-------+
| id|     word|part|chapter|
+---+---------+----+-------+
|  0|      The|   1|      1|
|  1|  Project|   1|      1|
|  2|Gutenberg|   1|      1|
|  3|    EBook|   1|      1|
|  4|       of|   1|      1|
+---+---------+----+-------+
only showing top 5 rows



In [15]:
subquery = """
SELECT chapter, w1, w2, w3, COUNT(*) as count
FROM
(
    SELECT
    chapter,
    word AS w1,
    LEAD(word, 1) OVER(PARTITION BY chapter ORDER BY id ) AS w2,
    LEAD(word, 2) OVER(PARTITION BY chapter ORDER BY id ) AS w3
    FROM text
)
GROUP BY chapter, w1, w2, w3
ORDER BY chapter, count DESC
"""
spark.sql(subquery).show(5)

+-------+----+-----+-----+-----+
|chapter|  w1|   w2|   w3|count|
+-------+----+-----+-----+-----+
|      1|   I|think| that|   34|
|      1| one|   of|  the|   24|
|      1|that|   it|  was|   24|
|      1|Lord|   St|Simon|   23|
|      1|  It|   is|    a|   22|
+-------+----+-----+-----+-----+
only showing top 5 rows



In [16]:

#   Most frequent 3-tuple per chapter
query = """
SELECT chapter, w1, w2, w3, count FROM
(
  SELECT
  chapter,
  ROW_NUMBER() OVER (PARTITION BY chapter ORDER BY count DESC) AS row,
  w1, w2, w3, count
  FROM ( %s )
)
WHERE row = 1
ORDER BY chapter ASC
""" % subquery

spark.sql(query).show()

+-------+---------+------+------+-----+
|chapter|       w1|    w2|    w3|count|
+-------+---------+------+------+-----+
|      1|        I| think|  that|   34|
|      2|      the|United|States|   57|
|      3|      the|United|States|  100|
|      4|      the|United|States|  100|
|      5|      met|  with|    in|   77|
|      6|       of|   the|  bone|   56|
|      7|commander|    in| chief|   51|
|      8|     said|Prince|Andrew|   26|
|      9|        I|   don|     t|   38|
|     10|      out|    of|   the|   38|
|     11|       of|   the|French|   40|
|     12|      the|  will|    of|   30|
+-------+---------+------+------+-----+

