In [0]:
dbutils.fs.cp(
  'dbfs:/FileStore/Spark_The_Definitive_Guide_master.zip', 
  'file:/tmp/Spark_The_Definitive_Guide_master.zip')

Out[16]: True

In [0]:
%sh unzip /tmp/Spark_The_Definitive_Guide_master.zip -d /tmp

Archive:  /tmp/Spark_The_Definitive_Guide_master.zip
4ba5601eb9b9aed1d01ab79775e3af228216ff6f
   creating: /tmp/Spark-The-Definitive-Guide-master/
  inflating: /tmp/Spark-The-Definitive-Guide-master/.gitignore  
  inflating: /tmp/Spark-The-Definitive-Guide-master/README.md  
   creating: /tmp/Spark-The-Definitive-Guide-master/code/
  inflating: /tmp/Spark-The-Definitive-Guide-master/code/A_Gentle_Introduction_to_Spark-Chapter_1_Defining_Spark.scala  
  inflating: /tmp/Spark-The-Definitive-Guide-master/code/A_Gentle_Introduction_to_Spark-Chapter_2_A_Gentle_Introduction_to_Spark.py  
  inflating: /tmp/Spark-The-Definitive-Guide-master/code/A_Gentle_Introduction_to_Spark-Chapter_2_A_Gentle_Introduction_to_Spark.scala  
  inflating: /tmp/Spark-The-Definitive-Guide-master/code/A_Gentle_Introduction_to_Spark-Chapter_3_A_Tour_of_Sparks_Toolset.py  
  inflating: /tmp/Spark-The-Definitive-Guide-master/code/A_Gentle_Introduction_to_Spark-Chapter_3_A_Tour_of_Sparks_Toolset.r  
  inflating: /tmp/S

In [0]:
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("file:///tmp/Spark-The-Definitive-Guide-master/data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
# Converting to Spark Types
# convert native types to Spark types
# one such function is lit: This function converts a type in another language to its correspnding Spark representation.

from pyspark.sql.functions import lit
df.select(lit(5), lit("five"), lit(5.0)).printSchema()

root
 |-- 5: integer (nullable = false)
 |-- five: string (nullable = false)
 |-- 5.0: double (nullable = false)



In [0]:
# Working with Booleans
from pyspark.sql.functions import col
df.where(col("InvoiceNo") != 536365)\
.select("InvoiceNo", "Description")\
.show(5, False)

df.where("InvoiceNo = 536365") \
.show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00|3.39     |17850.0  

In [0]:
# In Spark, you should always chain together and filters as a sequential filter.
# The reason for this is that even if Boolean statements are expressed serially (one after the other), Spark will flatten all of these filters into one statement and perform the filter at the same time, creating the and statement for us
# or statements need to be specified in the same statement

from pyspark.sql.functions import instr
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [0]:
# Boolean expressions are not just reserved to filters. To filter a DataFrame, you can also just specify a Boolean column:
from pyspark.sql.functions import instr, col
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
.where("isExpensive")\
.select("unitPrice", "isExpensive").show(5)


from pyspark.sql.functions import expr
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
.where("isExpensive")\
.select("Description", "UnitPrice").show(5)


+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



In [0]:

# One “gotcha” that can come up is if you’re working with null data when creating Boolean expressions.
# If there is a null in your data, you’ll need to treat things a bit differently. Here’s how you can ensure
# that you perform a null-safe equivalence test:

from pyspark.sql import Row
df1 = spark.createDataFrame([
    Row(id=1, value='foo'),
    Row(id=2, value=None)
])
df1.select(
    df1['value'] == 'foo',
    df1['value'].eqNullSafe('foo'),
    df1['value'].eqNullSafe(None)
).show()






df2 = spark.createDataFrame([
    Row(value = 'bar'),
    Row(value = None)
])
print(df1.join(df2, df1["value"] == df2["value"]).count())

print(df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count())

df2 = spark.createDataFrame([
    Row(id=1, value=float('NaN')),
    Row(id=2, value=42.0),
    Row(id=3, value=None)
])
df2.show()
df2.select(
    df2['value'].eqNullSafe(None),
    df2['value'].eqNullSafe(float('NaN')),
    df2['value'].eqNullSafe(42.0)
).show()

+-------------+---------------+----------------+
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
|         true|           true|           false|
|         null|          false|            true|
+-------------+---------------+----------------+

0
1
+---+-----+
| id|value|
+---+-----+
|  1|  NaN|
|  2| 42.0|
|  3| null|
+---+-----+

+----------------+---------------+----------------+
|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+----------------+---------------+----------------+
|           false|           true|           false|
|           false|          false|            true|
|            true|          false|           false|
+----------------+---------------+----------------+



In [0]:
# Working with Numbers
from pyspark.sql.functions import expr, pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)

# Same Can Be Done Using SQL Expression
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [0]:
# the round function rounds up if you’re exactly in between two numbers. You can round down by using the bround:
from pyspark.sql.functions import lit, round, bround
df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [0]:
# numerical task is to compute the correlation of two columns. For example, we can see
# the Pearson correlation coefficient for two columns to see if cheaper things are typically bought
# in greater quantities. We can do this through a function as well as through the DataFrame
# statistic methods:

from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [0]:
# Another common task is to compute summary statistics for a column or set of columns. We can
# use the describe method to achieve exactly this. This will take all numeric columns and
# calculate the count, mean, standard deviation, min, and max
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [0]:
# As a last note, we can also add a unique ID to each row by using the function
# monotonically_increasing_id. This function generates a unique value for each row, starting
# with 0:
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows



In [0]:
# Working with Strings
# String Manipulation Functions


from pyspark.sql.functions import initcap 
df.select(initcap(col("Description"))).show(5) # Capitlize First Char Of Word

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
lower(col("Description")),
upper(lower(col("Description")))).show(5) 

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
ltrim(lit(" HELLO ")).alias("ltrim"),
rtrim(lit(" HELLO ")).alias("rtrim"),
trim(lit(" HELLO ")).alias("trim"),
lpad(lit("HELLLLLLL"), 2, " ").alias("lp"), # Add Remaining Number Of Values TO make String Len given
rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)
# Note that if lpad or rpad takes a number less than the length of the string, it will always remove
# values from the right side of the string.

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
+--------------------+
only showing top 5 rows

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
|CREAM CUPID HEART...|cream cupid heart...|     CREAM CUPID HEART...|
|KNITTED UNION FLA...|knitted union fla...|     KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|red woolly hottie...|     RED WOOLLY HOTTIE...|
+--------------------+--------------------+-------------------------+
only showing top 5 rows

+------+------+-----+---+----------+
| ltrim| rtrim| trim| lp|        rp|
+------+------+-----+---+----------+
|H

In [0]:
# Regex Expression
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

df.select(df.Description, regexp_replace(df.Description, regex_string, "COLOR REPLACEMENT").alias("REGEXP_EXAMPLE")).show(5, truncate=False)

+-----------------------------------+--------------------------------------------------------+
|Description                        |REGEXP_EXAMPLE                                          |
+-----------------------------------+--------------------------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |COLOR REPLACEMENT HANGING HEART T-LIGHT HOLDER          |
|WHITE METAL LANTERN                |COLOR REPLACEMENT METAL LANTERN                         |
|CREAM CUPID HEARTS COAT HANGER     |CREAM CUPID HEARTS COAT HANGER                          |
|KNITTED UNION FLAG HOT WATER BOTTLE|KNITTED UNION FLAG HOT WATER BOTTLE                     |
|RED WOOLLY HOTTIE WHITE HEART.     |COLOR REPLACEMENT WOOLLY HOTTIE COLOR REPLACEMENT HEART.|
+-----------------------------------+--------------------------------------------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1347"),col("Description"))\
.show(2, truncate=False)

+----------------------------------+----------------------------------+
|translate(Description, LEET, 1347)|Description                       |
+----------------------------------+----------------------------------+
|WHI73 HANGING H3AR7 7-1IGH7 HO1D3R|WHITE HANGING HEART T-LIGHT HOLDER|
|WHI73 M37A1 1AN73RN               |WHITE METAL LANTERN               |
+----------------------------------+----------------------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import regexp_extract
regex_string = "(BLACK|WHITE|RED|GREEN|BLUE)"

df.select(df.Description, regexp_extract(df.Description, regex_string, 1).alias("result")).show(5, truncate=False)

+-----------------------------------+------+
|Description                        |result|
+-----------------------------------+------+
|WHITE HANGING HEART T-LIGHT HOLDER |WHITE |
|WHITE METAL LANTERN                |WHITE |
|CREAM CUPID HEARTS COAT HANGER     |      |
|KNITTED UNION FLAG HOT WATER BOTTLE|      |
|RED WOOLLY HOTTIE WHITE HEART.     |RED   |
+-----------------------------------+------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.where("hasSimpleColor")\
.select("Description").show(3, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [0]:
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
    return locate(color_string.upper(), column).cast("boolean").alias("is_" + color_string)

selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
print (selectedColumns)
selectedColumns.append(expr("*")) # has to a be Column type
print (selectedColumns)
df.select(*selectedColumns).show()
df.select(*selectedColumns).where(expr("is_white OR is_red"))\
.select("Description").show(3, False)

[Column<'CAST(locate(BLACK, Description, 1) AS BOOLEAN) AS is_black'>, Column<'CAST(locate(WHITE, Description, 1) AS BOOLEAN) AS is_white'>, Column<'CAST(locate(RED, Description, 1) AS BOOLEAN) AS is_red'>, Column<'CAST(locate(GREEN, Description, 1) AS BOOLEAN) AS is_green'>, Column<'CAST(locate(BLUE, Description, 1) AS BOOLEAN) AS is_blue'>]
[Column<'CAST(locate(BLACK, Description, 1) AS BOOLEAN) AS is_black'>, Column<'CAST(locate(WHITE, Description, 1) AS BOOLEAN) AS is_white'>, Column<'CAST(locate(RED, Description, 1) AS BOOLEAN) AS is_red'>, Column<'CAST(locate(GREEN, Description, 1) AS BOOLEAN) AS is_green'>, Column<'CAST(locate(BLUE, Description, 1) AS BOOLEAN) AS is_blue'>, Column<'unresolvedstar()'>]
+--------+--------+------+--------+-------+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|is_black|is_white|is_red|is_green|is_blue|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerI

In [0]:
# Spark’s TimestampType class supports only second-level precision, which means that if you’re going to be working with milliseconds or microseconds, you’ll need to work around this problem by potentially operating on them as longs. Any more precision when coercing to a TimestampType will be removed.

# Spark can be a bit particular about what format you have at any given point in time. It’s important to be explicit when parsing or converting to ensure that there are no issues in doing so. At the end of the day, Spark is working with Java dates and timestamps and therefore conforms to those standards

# By Default TimeStamp will be shown in UTC
from pyspark.sql.functions import current_date, current_timestamp
spark.range(10).withColumn("curr_date", current_date()).withColumn("curr_timestamp", current_timestamp()).show(truncate=False)

from pyspark.sql.functions import col, date_add, date_sub
spark.range(10).withColumn('curr_date', current_date())\
    .withColumn("5add", date_add(col("curr_date"),5))\
    .withColumn("5sub", date_sub(col("curr_date"), 5))\
    .show(truncate=False)


from pyspark.sql.functions import datediff
spark.range(10)\
    .withColumn('today', current_date())\
    .withColumn('5add', date_add(col('today'), 5))\
    .withColumn('date_diff', datediff(col('today'), col('5add')))\
    .show(truncate=False)

from pyspark.sql.functions import months_between
spark.range(10)\
    .withColumn('today', current_date())\
    .withColumn('60add', date_add(col('today'), 60))\
    .withColumn('55add', date_add(col('today'), 55))\
    .withColumn('months_between60', months_between(col('today'), col('60add')))\
    .withColumn('months_between55', months_between(col('today'), col('55add')))\
    .show(truncate=False)

+---+----------+-----------------------+
|id |curr_date |curr_timestamp         |
+---+----------+-----------------------+
|0  |2024-02-18|2024-02-18 04:37:34.964|
|1  |2024-02-18|2024-02-18 04:37:34.964|
|2  |2024-02-18|2024-02-18 04:37:34.964|
|3  |2024-02-18|2024-02-18 04:37:34.964|
|4  |2024-02-18|2024-02-18 04:37:34.964|
|5  |2024-02-18|2024-02-18 04:37:34.964|
|6  |2024-02-18|2024-02-18 04:37:34.964|
|7  |2024-02-18|2024-02-18 04:37:34.964|
|8  |2024-02-18|2024-02-18 04:37:34.964|
|9  |2024-02-18|2024-02-18 04:37:34.964|
+---+----------+-----------------------+

+---+----------+----------+----------+
|id |curr_date |5add      |5sub      |
+---+----------+----------+----------+
|0  |2024-02-18|2024-02-23|2024-02-13|
|1  |2024-02-18|2024-02-23|2024-02-13|
|2  |2024-02-18|2024-02-23|2024-02-13|
|3  |2024-02-18|2024-02-23|2024-02-13|
|4  |2024-02-18|2024-02-23|2024-02-13|
|5  |2024-02-18|2024-02-23|2024-02-13|
|6  |2024-02-18|2024-02-23|2024-02-13|
|7  |2024-02-18|2024-02-23|2024-02-

In [0]:
from pyspark.sql.functions import to_date, lit, col
spark.range(5).withColumn("date", lit("2017-01-01"))\
.select(to_date(col("date"))).show(1)
spark.range(5).withColumn("date", lit("2017-01-01"))\
.select(to_date(col("date"))).printSchema()

# Spark will not throw an error if it cannot parse the date; rather, it will just return null. This can be a bit tricky in larger pipelines because you might be expecting your data in one format and getting it in another.

spark.range(5).select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

# We find this to be an especially tricky situation for bugs because some dates might match the correct format, whereas others do not. In the previous example, notice how the second date appears as Decembers 11th instead of the correct day, November 12th. Spark doesn’t throw an error because it cannot know whether the days are mixed up or that specific row is incorrect.
# Step by step, and come up with a robust way to avoid these issues entirely.
# The first step is to remember that we need to specify our date format according to the Java SimpleDateFormat standard.

from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
to_date(lit("2017-12-11"), dateFormat).alias("date"),
to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.show()

from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

# After we have our date or timestamp in the correct format and type, comparing between them is actually quite easy. We just need to be sure to either use a date/timestamp type or specify our string according to the right format of yyyy-MM-dd if we’re comparing a date:

cleanDateDF.filter(col("date2") > lit("2017-12-12")).show()

# One minor point is that we can also set this as a string, which Spark parses to a literal:
cleanDateDF.filter(col("date2") > "'2017-12-12'").show()

# Implicit type casting is an easy way to shoot yourself in the foot, especially when dealing with null values or dates in different timezones or formats. We recommend that you parse them explicitly instead of relying on implicit conversions.

+-------------+
|to_date(date)|
+-------------+
|   2017-01-01|
+-------------+
only showing top 1 row

root
 |-- to_date(date): date (nullable = true)

+-------------------+-------------------+
|to_date(2016-20-12)|to_date(2017-12-11)|
+-------------------+-------------------+
|               null|         2017-12-11|
+-------------------+-------------------+
only showing top 1 row

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+

+------------------------------+
|to_timestamp(date, yyyy-dd-MM)|
+------------------------------+
|           2017-11-12 00:00:00|
+------------------------------+

+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+

+----+-----+
|date|date2|
+----+-----+
+----+-----+



In [0]:
# Working with Nulls in Data
# As a best practice, you should always use nulls to represent missing or empty data in your DataFrames
# Spark can optimize working with null values more than it can if you use empty strings or other values
# The primary way of interacting with null values, at DataFrame scale, is to use the .na subpackage on a DataFrame

# Nulls are a challenging part of all programming, and Spark is no exception. In our opinion, being
# explicit is always better than being implicit when handling null values. For instance, in this part of the
# book, we saw how we can define columns as having null types. However, this comes with a catch.
# When we declare a column as not having a null time, that is not actually enforced. To reiterate, when
# you define a schema in which all columns are declared to not have null values, Spark will not enforce
# that and will happily let null values into that column. The nullable signal is simply to help Spark SQL
# optimize for handling that column. If you have null values in columns that should not have null values,
# you can get an incorrect result or see strange exceptions that can be difficult to debug.

# There are two things you can do with null values: you can explicitly drop nulls or you can fill them with a value (globally or on a per-column basis). Let’s experiment with each of these now.

# Spark includes a function to allow you to select the first non-null value from a set of columns by using the coalesce function. In this case, there are no null values, so it simply returns the first column:
    
from pyspark.sql.functions import coalesce, col
df.select(coalesce(col("Description"), col("CustomerId"))).show()

# spark.sql("SELECT ifnull(null, 'return_value'), ifnull('ifnull_default', 'return_value'), nullif('value', 'value'), nvl(null, 'return_value'), nvl2('not_null', 'return_value', 'else_value') FROM dfTable LIMIT 1").show()

# ifnull allows you to select the second value if the first is null, and defaults to the first. 
spark.sql("SELECT ifnull(null, 'return_value'), ifnull('ifnull_default', 'return_value') FROM dfTable LIMIT 1").show()

# use nullif, which returns null if the two values are equal or else returns the second if they are not. 
spark.sql("SELECT nullif(null, null), nullif('value', 'value'), nullif(null, 'value'), nullif('notvalue', 'value') FROM dfTable LIMIT 1").show()

# nvl returns the second value if the first is null, but defaults to the first
spark.sql("SELECT nvl(null, 'second_value'), nvl('first_value', 'second_value') FROM dfTable LIMIT 1").show()

# nvl2 returns the second value if the first is not null; otherwise, it will return the last specified value
spark.sql("SELECT nvl2(null, 'second_value','second_value2'), nvl2('first_value', 'second_value','second_value2') FROM dfTable LIMIT 1").show()

+---------------------------------+
|coalesce(Description, CustomerId)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
|             SET 7 BABUSHKA NE...|
|             GLASS STAR FROSTE...|
|             HAND WARMER UNION...|
|             HAND WARMER RED P...|
|             ASSORTED COLOUR B...|
|             POPPY'S PLAYHOUSE...|
|             POPPY'S PLAYHOUSE...|
|             FELTCRAFT PRINCES...|
|             IVORY KNITTED MUG...|
|             BOX OF 6 ASSORTED...|
|             BOX OF VINTAGE JI...|
|             BOX OF VINTAGE AL...|
|             HOME BUILDING BLO...|
|             LOVE BUILDING BLO...|
|             RECIPE BOX WITH M...|
+---------------------------------+
only showing top 20 rows

+--------------------------+------------------------------------+
|ifnull(NULL, return_value)|ifnull(ifnull_de

In [0]:
# Drop Null Values
# The simplest function is drop, which removes rows that contain nulls. The default is to drop any row in which any value is null:
df.na.drop().show(5)
df.na.drop("any").show(5)

# Using “all” drops the row only if all values are null or NaN for that row:
df.na.drop("all").show(5)

# We can also apply this to certain sets of columns by passing in an array of columns
df.na.drop("all", subset=["StockCode", "InvoiceNo"]).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows

+--

In [0]:
# Fill null Values
# you can fill one or more columns with a set of values. This can be done by specifying a map—that is a particular value and a set of columns.

from pyspark.sql.types import ArrayType, StringType, LongType, StructType, StructField, MapType
smapleListSchema = StructType([
    StructField("RollNo", LongType(), True),
    StructField("FName", StringType(), True),
    StructField("LName", StringType(), True),
    StructField("Score", MapType(StringType(), LongType()), True)
])

sampleList = [
    (None, "Uddhav", "Savani", {"P" : 20}),
    (2, "Dev", "Patel", {"P" :23}),
    (1, None, "Savani", {"M" : 21}),
    (2, "Dev", "Patel", {"M" :25}),
    (1, "Uddhav", "Savani", {"C" : 50}),
    (2, "Dev", "Patel", {"C" :80}),
    (2, "Dev", None, None)
]
sampleDF = spark.createDataFrame(data=sampleList, schema=smapleListSchema)
sampleDF.show()
sampleDF.printSchema()
sampleDF.na.fill("all2").show()
sampleDF.na.fill(5).show()

# sampleDF.select(create_map(lit("P"), lit(10))).show()
print (lit(19))
from pyspark.sql.functions import create_map, lit
# sampleDF.na.fill(
#     {
#         "RollNo": 0, 
#         "FName": "Unknown", 
#         "LName": "Unknown",
#         "Score": create_map([lit("P"),lit(10)]) 
#     }
# ).show()

# DataFrame.fillna(value, subset=None)[source]
# Parameters
#     valueint, float, string, bool or dict

# This Is Not Supported As Per API Docs.  Value to replace null values with. If the value is a dict, then subset is ignored and value must be a mapping from column name (string) to replacement value. The replacement value must be an int, float, boolean, or string.

sampleDF.na.fill(
    {
        "RollNo": 0, 
        "FName": "Unknown", 
        "LName": "Unknown"
    }
).show()

+------+------+------+---------+
|RollNo| FName| LName|    Score|
+------+------+------+---------+
|  null|Uddhav|Savani|{P -> 20}|
|     2|   Dev| Patel|{P -> 23}|
|     1|  null|Savani|{M -> 21}|
|     2|   Dev| Patel|{M -> 25}|
|     1|Uddhav|Savani|{C -> 50}|
|     2|   Dev| Patel|{C -> 80}|
|     2|   Dev|  null|     null|
+------+------+------+---------+

root
 |-- RollNo: long (nullable = true)
 |-- FName: string (nullable = true)
 |-- LName: string (nullable = true)
 |-- Score: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)

+------+------+------+---------+
|RollNo| FName| LName|    Score|
+------+------+------+---------+
|  null|Uddhav|Savani|{P -> 20}|
|     2|   Dev| Patel|{P -> 23}|
|     1|  all2|Savani|{M -> 21}|
|     2|   Dev| Patel|{M -> 25}|
|     1|Uddhav|Savani|{C -> 50}|
|     2|   Dev| Patel|{C -> 80}|
|     2|   Dev|  all2|     null|
+------+------+------+---------+

+------+------+------+---------+
|RollNo| FName| LN

In [0]:
# replace 
df.show(5,truncate=False)
df.na.replace(["WHITE HANGING HEART T-LIGHT HOLDER", "CREAM CUPID HEARTS COAT HANGER"], ["REPLACED", "REPLACED2"], "Description").show(5, truncate=False)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+---------+-----

In [0]:
# Ordering
# use asc_nulls_first, desc_nulls_first, asc_nulls_last, or desc_nulls_last to specify where you would like your null values to appear in an ordered DataFrame.

In [0]:
# Complex Types
# Structs
# We can create a struct by wrapping a set of columns in parenthesis in a query
df.selectExpr("(Description, InvoiceNo) as complex", "*").show(2, truncate=False)
df.selectExpr("struct(Description, InvoiceNo) as complex", "*").show(2, truncate=False)
df.selectExpr("(Description, InvoiceNo) as complex", "*").printSchema()

+--------------------------------------------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|complex                                     |InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+--------------------------------------------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|{WHITE HANGING HEART T-LIGHT HOLDER, 536365}|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|{WHITE METAL LANTERN, 536365}               |536365   |71053    |WHITE METAL LANTERN               |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+--------------------------------------------+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------

In [0]:
from pyspark.sql.functions import struct, col
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

# Use Dot Syntex Or GetField Function to access inner Field
complexDF.select("complex.Description").show()
complexDF.select(col("complex").getField("Description")).show()

# We can also query all values in the struct by using *. This brings up all the columns to the top-level DataFrame:
complexDF.select("complex.*").show(2, truncate=False)
    

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|
|HAND WARMER UNION...|
|HAND WARMER RED P...|
|ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE...|
|FELTCRAFT PRINCES...|
|IVORY KNITTED MUG...|
|BOX OF 6 ASSORTED...|
|BOX OF VINTAGE JI...|
|BOX OF VINTAGE AL...|
|HOME BUILDING BLO...|
|LOVE BUILDING BLO...|
|RECIPE BOX WITH M...|
+--------------------+
only showing top 20 rows

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|
|HAND WARMER UNION...|
|HAND WARMER RED P...|
|ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE...|
|FELTCRAFT PRINCES...|
|IVORY KNITTED MUG...|
|BOX OF 6 ASSORTED...|
|BOX OF 

In [0]:
# Arrays As Complex Data Types
# 1) Split
from pyspark.sql.functions import split, col
df.select(split(df.Description, " ")).show(2, truncate=False)
df.select(split(df.Description, " ")).printSchema()

# Array Element Level Access
df.withColumn("description_split", split(col("Description"), " ")).selectExpr("description_split[0]").show(2, truncate=False)

# Array Length
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3

# array_contains
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHIT")).show(2) 
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2) 

# explode : It Crates One Row For Each Split in the feild in below example
from pyspark.sql.functions import explode
df.select(explode(split(col("Description"), " "))).show(2) 

+----------------------------------------+
|split(Description,  , -1)               |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                 |
+----------------------------------------+
only showing top 2 rows

root
 |-- split(Description,  , -1): array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|description_split[0]|
+--------------------+
|WHITE               |
|WHITE               |
+--------------------+
only showing top 2 rows

+-------------------------------+
|size(split(Description,  , -1))|
+-------------------------------+
|                              5|
|                              3|
+-------------------------------+
only showing top 2 rows

+-----------------------------------------------+
|array_contains(split(Description,  , -1), WHIT)|
+-----------------------------------------------+
|                                          false|
|             

In [0]:
# Maps

from pyspark.sql.functions import lit, create_map
df.select(create_map(lit("P"), lit(10))).show(5)

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.show(2)

# You can query them by using the proper key. A missing key returns null:
df.select(create_map(lit("P"), lit(10)).alias('complex_map'))\
.selectExpr('complex_map["P"]').show(2)

# explode Map, 1 Key Of Map Will Create One New Line For each Row
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("explode(complex_map)").show(2)




+----------+
|map(P, 10)|
+----------+
| {P -> 10}|
| {P -> 10}|
| {P -> 10}|
| {P -> 10}|
| {P -> 10}|
+----------+
only showing top 5 rows

+--------------------+
|         complex_map|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows

+--------------+
|complex_map[P]|
+--------------+
|            10|
|            10|
+--------------+
only showing top 2 rows

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
+--------------------+------+
only showing top 2 rows



In [0]:
# Working with JSON
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

from pyspark.sql.functions import get_json_object, json_tuple
jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
json_tuple(col("jsonString"), "myJSONKey")).show(2, truncate=False)

jsonDF.select(
get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
json_tuple(col("jsonString"), "myJSONKey")).printSchema()


# We can also  turn a StructType into a JSON string by using the to_json function
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct"))).show(5, truncate=False)

+------+-----------------------+
|column|c0                     |
+------+-----------------------+
|2     |{"myJSONValue":[1,2,3]}|
+------+-----------------------+

root
 |-- column: string (nullable = true)
 |-- c0: string (nullable = true)

+--------------------------------------------------------------------------+
|to_json(myStruct)                                                         |
+--------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"} |
|{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}                |
|{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}     |
|{"InvoiceNo":"536365","Description":"KNITTED UNION FLAG HOT WATER BOTTLE"}|
|{"InvoiceNo":"536365","Description":"RED WOOLLY HOTTIE WHITE HEART."}     |
+--------------------------------------------------------------------------+
only showing top 5 rows



In [0]:
# You can use the from_json function to parse this (or other JSON data) back in. This naturally requires you to specify a schema, and optionally you can specify a map of options, as well:
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
StructField("InvoiceNo",StringType(),True),
StructField("Description",StringType(),True)))
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

+--------------------+--------------------+
|  from_json(newJSON)|             newJSON|
+--------------------+--------------------+
|{536365, WHITE HA...|{"InvoiceNo":"536...|
|{536365, WHITE ME...|{"InvoiceNo":"536...|
+--------------------+--------------------+
only showing top 2 rows



In [0]:
# UDFs
# By default, these functions are registered as temporary functions to be used in that specific SparkSession or Context.
# When you use the function, there are essentially two different things that occur. If the function is written in Scala or Java, you can use it within the Java Virtual Machine (JVM). This means that there will be little performance penalty aside from the fact that you can’t take advantage of code generation capabilities that Spark has for built-in functions. There can be performance issues if you create or use a lot of objects; we cover that in the section on optimization in Chapter 19.

# If the function is written in Python, something quite different happens. Spark starts a Python process on the worker, serializes all of the data to a format that Python can understand (remember, it was in the JVM earlier), executes the function row by row on that data in the Python process, and then finally returns the results of the row operations to the JVM and Spark



Starting this Python process is expensive, but the real cost is in serializing the data to Python. This is
costly for two reasons: it is an expensive computation, but also, after the data enters Python, Spark
cannot manage the memory of the worker. This means that you could potentially cause a worker to fail
if it becomes resource constrained (because both the JVM and Python are competing for memory on
the same machine). We recommend that you write your UDFs in Scala or Java—the small amount of
time it should take you to write the function in Scala will always yield significant speed ups, and on
top of that, you can still use the function from Python

In [0]:
udfExampleDF = spark.range(5).toDF("num")

def power3(double_value):
    return double_value ** 3

power3(2.0) 

# First, we need to register the function to make it available as a DataFrame function:
from pyspark.sql.functions import udf
power3udf = udf(power3)

from pyspark.sql.functions import col
udfExampleDF.select(power3udf(col("num"))).show(2)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
+-----------+
only showing top 2 rows



While using UDF. It is important to note that specifying the return type is not necessary, but it is a best practice.

If you specify the type that doesn’t align with the actual type returned by the function, Spark will
not throw an error but will just return null to designate a failure.


In [0]:

from pyspark.sql.types import IntegerType, DoubleType
spark.udf.register("power3py", power3, DoubleType())

udfExampleDF.selectExpr("power3py(num)").show(2)

# This is because the range creates integers. When integers are operated on in Python, Python won’t convert them into floats (the corresponding type to Spark’s double type), therefore we see null. We can remedy this by ensuring that our Python function returns a float instead of an integer and the function will behave correctly.

+-------------+
|power3py(num)|
+-------------+
|         null|
|         null|
+-------------+
only showing top 2 rows



As a last note, you can also use UDF/UDAF creation via a Hive syntax. To allow for this, first
you must enable Hive support when they create their SparkSession (via
SparkSession.builder().enableHiveSupport()). Then you can register UDFs in SQL. This
is only supported with precompiled Scala and Java packages, so you’ll need to specify them as a
dependency:

CREATE TEMPORARY FUNCTION myFunc AS 'com.organization.hive.udf.FunctionName'

Additionally, you can register this as a permanent function in the Hive Metastore by removing
TEMPORARY.