## Examples from Data Analysis with Python and PySpark

(A.K.A PySpark in Action)

In [2]:
import numpy as np
import pandas as pd
import pyspark
import urllib

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *

In [3]:
# getOrCreate() reuses an already-running session instead of creating a new
# one; the trade-off is that some JVM config options may then not be
# changeable.
spark = SparkSession.builder.appName("Analyzing vocabulary").getOrCreate()

## Read Data

In [4]:
# `import urllib` on its own does not load the `urllib.request` submodule;
# import it explicitly so this cell does not depend on some other library
# having imported it as a side effect.
import urllib.request

# Download the book text to a local file; the cell displays the
# (filename, headers) tuple returned by urlretrieve.
url = "https://www.gutenberg.org/files/205/205-0.txt"
urllib.request.urlretrieve(url, './thoreau.txt')

('./thoreau.txt', <http.client.HTTPMessage at 0x406f5da650>)

In [5]:
# Read the downloaded text file into a DataFrame.
book = spark.read.text("./thoreau.txt")

# Print the schema of the resulting DataFrame.
book.printSchema()

DataFrame[value: string]

In [8]:
# See docs: show the docstring of the `read` property of SparkSession.
# Plain-Python help() is used instead of a shell escape / trailing `?`,
# so the cell does not depend on IPython-specific line parsing.
help(SparkSession.read)

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x406f810720>
[0;31mDocstring:[0m  
Returns a :class:`DataFrameReader` that can be used to read data
in as a :class:`DataFrame`.

.. versionadded:: 2.0.0

Returns
-------
:class:`DataFrameReader`


In [11]:
# Preview the first 10 rows, truncating displayed values to 50 characters.
book.show(10, truncate=50)

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg eBook of Walden, by Henry...|
|                                                  |
|This eBook is for the use of anyone anywhere in...|
|most other parts of the world at no cost and wi...|
|whatsoever. You may copy it, give it away or re...|
|of the Project Gutenberg License included with ...|
|www.gutenberg.org. If you are not located in th...|
|will have to check the laws of the country wher...|
|                                 using this eBook.|
|                                                  |
+--------------------------------------------------+
only showing top 10 rows



## Tokenization

In [14]:
from pyspark.sql.functions import split

# Tokenize: break each row's text on single spaces and keep only the
# resulting array column, renamed to "line".
lines = book.select(
    split(book["value"], " ").alias("line")
)

lines.show(5, truncate=50)

+--------------------------------------------------+
|                                              line|
+--------------------------------------------------+
|[The, Project, Gutenberg, eBook, of, Walden,, b...|
|                                                []|
|[This, eBook, is, for, the, use, of, anyone, an...|
|[most, other, parts, of, the, world, at, no, co...|
|[whatsoever., You, may, copy, it,, give, it, aw...|
+--------------------------------------------------+
only showing top 5 rows



In [15]:
# Column selection, form 1: attribute access on the DataFrame.
book.select(book.value)

DataFrame[value: string]

In [18]:
# Column selection, form 2: bracket (item) access on the DataFrame.
book.select(book["value"])

DataFrame[value: string]

In [19]:
# Column selection, form 3: the col() function with the column name.
book.select(col("value"))

DataFrame[value: string]

In [16]:
# Column selection, form 4: the column name passed as a plain string.
book.select("value")

DataFrame[value: string]

In [25]:
from pyspark.sql.functions import explode, col

# explode() gives every element of each array in "line" its own row,
# flattening the column of arrays into a single column of words
# (comparable to ravel() in numpy).
words = lines.select(
    explode(lines["line"]).alias("word")
)

words.show(15)

+---------+
|     word|
+---------+
|      The|
|  Project|
|Gutenberg|
|    eBook|
|       of|
|  Walden,|
|       by|
|    Henry|
|    David|
|  Thoreau|
|         |
|     This|
|    eBook|
|       is|
|      for|
+---------+
only showing top 15 rows



## Cleaning Data

In [27]:
from pyspark.sql.functions import lower

# Lower-case every word; the result column is named "word_lower".
words_lower = words.select(
    lower(words["word"]).alias("word_lower")
)
words_lower.show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|   walden,|
|        by|
|     henry|
|     david|
|   thoreau|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
|  anywhere|
+----------+
only showing top 20 rows



In [28]:
from pyspark.sql.functions import regexp_extract

# Keep only the letters a-z matched by the pattern (group 0 is the whole
# match), e.g. "walden," becomes "walden". The cleaned column is named "word".
letters_only = regexp_extract(words_lower["word_lower"], "[a-z]+", 0)
words_clean = words_lower.select(letters_only.alias("word"))

words_clean.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|   walden|
|       by|
|    henry|
|    david|
|  thoreau|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



In [30]:
# Keep only the rows whose word is not the empty string.
# (A filter expression can also be inverted with the ~ operator.)
words_nonull = words_clean.where(col("word") != "")

words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|   walden|
|       by|
|    henry|
|    david|
|  thoreau|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
|       in|
+---------+
only showing top 20 rows



## Count