# Working with Different Types of Data

In [1]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
# Load one day of the retail dataset: the first CSV row supplies the column
# names, and Spark is asked to infer each column's type from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../data/retail-data/by-day/2010-12-01.csv")
)

df.printSchema()
# Register the frame under the name "dfTable" as a temporary view.
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



### Boolean Operations

In [3]:
from pyspark.sql.functions import col

In [4]:
df.where(col("InvoiceNo") != 536365).select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [5]:
df.where("InvoiceNo <> 536365").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [6]:
## Cleaner Option 

df.where("InvoiceNo = 536365").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [7]:
from pyspark.sql.functions import instr

In [8]:
## Chained filters - preferred pattern: name each condition as a Column expression, then combine them

priceFilter = col("UnitPrice") > 60

descriptionFilter = instr(df.Description, "POSTAGE") >= 1 
#OR descriptionFilter = instr(col("Description"), "POSTAGE") >= 1

stockCodeFilter = df.StockCode.isin("DOT") 
#OR stockCodeFilter = col("StockCode") == "DOT"

df.where(stockCodeFilter).where(priceFilter | descriptionFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [9]:
# Using with withColumn (above code can also be used)
from pyspark.sql.functions import expr

expensiveFilter = "NOT UnitPrice <= 250"

df.withColumn("isExpensive", expr(expensiveFilter)).where("isExpensive").show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+



In [10]:
df.withColumn("isExpensive", expr(expensiveFilter)).filter("isExpensive").show(5)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|isExpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+



### Number Operations

In [11]:
from pyspark.sql.functions import pow

fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5

df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(5)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 5 rows



In [12]:
df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice), 2) + 5) AS realQuantity").show(5)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 5 rows



#### Correlation

In [13]:
from pyspark.sql.functions import corr

df.stat.corr("Quantity", "UnitPrice")

-0.04112314436835551

In [14]:
df.select(corr("Quantity", "UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



#### Summary

In [15]:
df.select("Quantity", "UnitPrice").describe().show()

+-------+------------------+------------------+
|summary|          Quantity|         UnitPrice|
+-------+------------------+------------------+
|  count|              3108|              3108|
|   mean| 8.627413127413128| 4.151946589446603|
| stddev|26.371821677029203|15.638659854603892|
|    min|               -24|               0.0|
|    max|               600|            607.49|
+-------+------------------+------------------+



#### Cross-tabulation and Frequent Items

In [16]:
df.stat.crosstab("StockCode", "Quantity").show()

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [17]:
df.stat.freqItems(["StockCode", "Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



#### Adding Unique IDs to Each Row

In [18]:
from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(5)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
|                            2|
|                            3|
|                            4|
+-----------------------------+
only showing top 5 rows



### String Operations

In [19]:
from pyspark.sql.functions import initcap

df.select(initcap(col("Description"))).show()

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
|Set 7 Babushka Ne...|
|Glass Star Froste...|
|Hand Warmer Union...|
|Hand Warmer Red P...|
|Assorted Colour B...|
|Poppy's Playhouse...|
|Poppy's Playhouse...|
|Feltcraft Princes...|
|Ivory Knitted Mug...|
|Box Of 6 Assorted...|
|Box Of Vintage Ji...|
|Box Of Vintage Al...|
|Home Building Blo...|
|Love Building Blo...|
|Recipe Box With M...|
+--------------------+
only showing top 20 rows



In [20]:
from pyspark.sql.functions import lower, upper, rtrim, ltrim, trim

df.select(lower(col("Description")), upper(col("Description"))).show(5)

+--------------------+--------------------+
|  lower(Description)|  upper(Description)|
+--------------------+--------------------+
|white hanging hea...|WHITE HANGING HEA...|
| white metal lantern| WHITE METAL LANTERN|
|cream cupid heart...|CREAM CUPID HEART...|
|knitted union fla...|KNITTED UNION FLA...|
|red woolly hottie...|RED WOOLLY HOTTIE...|
+--------------------+--------------------+
only showing top 5 rows



In [21]:
from pyspark.sql.functions import lit

df.select(ltrim(lit("      Hello       ")), rtrim(lit("      Hello       ")), trim(lit("      Hello       "))).show(1)

+-------------------------+-------------------------+------------------------+
|ltrim(      Hello       )|rtrim(      Hello       )|trim(      Hello       )|
+-------------------------+-------------------------+------------------------+
|             Hello       |                    Hello|                   Hello|
+-------------------------+-------------------------+------------------------+
only showing top 1 row



In [22]:
from pyspark.sql.functions import rpad, lpad

df.select(lpad(lit("HELLOOO"), 10, "&"), rpad(lit("HELLOOO"), 10, "&")).show(1)

+--------------------+--------------------+
|lpad(HELLOOO, 10, &)|rpad(HELLOOO, 10, &)|
+--------------------+--------------------+
|          &&&HELLOOO|          HELLOOO&&&|
+--------------------+--------------------+
only showing top 1 row



In [23]:
# If lpad or rpad is given a target length shorter than the string, the string is truncated from the right in both cases
df.select(lpad(lit("HELLOOO  "), 3, "&"), rpad(lit("HELLOOO  "), 3, "&")).show(1)

+---------------------+---------------------+
|lpad(HELLOOO  , 3, &)|rpad(HELLOOO  , 3, &)|
+---------------------+---------------------+
|                  HEL|                  HEL|
+---------------------+---------------------+
only showing top 1 row



### Regular Expression

In [24]:
from pyspark.sql.functions import regexp_replace, translate, regexp_extract

regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

# Replace strings/phrases

df.select(regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [25]:
# Replace Characters

df.select(translate(col("Description"), "LEET", "1337"), col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [26]:
# Pull out first word/phrase

extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(regexp_extract(col("Description"), extract_str, 1).alias("extract_color")).show(2)

+-------------+
|extract_color|
+-------------+
|        WHITE|
|        WHITE|
+-------------+
only showing top 2 rows



In [27]:
# Contains
from pyspark.sql.functions import instr

containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

df.withColumn("hasBlackOrWhite", containsBlack | containsWhite).where("hasBlackOrWhite").select("Description").show(5, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 5 rows



### Dynamic Number of Arguments

In [28]:
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    """Build a boolean flag column named ``is_<color_string>``.

    The color name is upper-cased before being searched for in ``column``.
    ``locate`` yields the position of the match (0 when there is none), so
    casting that number to boolean gives a found / not-found flag.

    Args:
        column: Column to search in.
        color_string: Color name; used as given for the ``is_`` alias and
            upper-cased for the search.

    Returns:
        The boolean Column aliased as ``"is_" + color_string``.
    """
    position = locate(color_string.upper(), column)
    found_flag = position.cast("boolean")
    return found_flag.alias("is_" + color_string)

# One boolean "is_<color>" flag column per entry in simpleColors.
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to be a Column type

# Keep rows whose description mentions black or white. The predicate
# previously read "is_white OR is_white", which tested the same flag twice
# and left the is_black flag unused.
df.select(*selectedColumns).where(expr("is_black OR is_white")).select("Description").show(5, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
+----------------------------------+
only showing top 5 rows



In [29]:
df.select(*selectedColumns).where(expr("is_white OR is_red")).select("Description").show(5, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
|HAND WARMER RED POLKA DOT         |
|RED COAT RACK PARIS FASHION       |
+----------------------------------+
only showing top 5 rows



In [30]:
# Three-flag predicate: keep rows mentioning any of black, white or red.
# The previous text listed is_white twice, which made this cell equivalent to
# the two-flag "is_white OR is_red" filter.
df.select(*selectedColumns).where(expr("is_black OR is_white OR is_red")).select("Description").show(5, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
|HAND WARMER RED POLKA DOT         |
|RED COAT RACK PARIS FASHION       |
+----------------------------------+
only showing top 5 rows



### Date and Timestamps Operations

In [31]:
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(5).withColumn("today", current_date()).withColumn("now", current_timestamp())

dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [32]:
dateDF.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2020-06-27|2020-06-27 22:59:...|
|  1|2020-06-27|2020-06-27 22:59:...|
|  2|2020-06-27|2020-06-27 22:59:...|
|  3|2020-06-27|2020-06-27 22:59:...|
|  4|2020-06-27|2020-06-27 22:59:...|
+---+----------+--------------------+



In [33]:
# Add, Subtract Date

from pyspark.sql.functions import date_add, date_sub

dateDF.select(date_add(col("today"), 5).alias("fiveDaysFromNow"), date_sub(col("today"), 5).alias("fiveDaysAgo")).show()

+---------------+-----------+
|fiveDaysFromNow|fiveDaysAgo|
+---------------+-----------+
|     2020-07-02| 2020-06-22|
|     2020-07-02| 2020-06-22|
|     2020-07-02| 2020-06-22|
|     2020-07-02| 2020-06-22|
|     2020-07-02| 2020-06-22|
+---------------+-----------+



In [34]:
from pyspark.sql.functions import datediff, months_between, to_date

# Day difference between a date seven days back and today; the earlier date
# is passed first, so the result is negative.
(
    dateDF.withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff(col("week_ago"), col("today")))
    .show(1)
)

# Month difference between two fixed dates parsed from string literals.
(
    dateDF.select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-05-22")).alias("end"),
    )
    .select(months_between(col("start"), col("end")))
    .show(1)
)


+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -16.67741935|
+--------------------------------+
only showing top 1 row



In [35]:
from pyspark.sql.functions import to_date, lit

spark.range(5).withColumn("date", lit("2017-01-01")).select(to_date(col("date"))).show(1)

+---------------+
|to_date(`date`)|
+---------------+
|     2017-01-01|
+---------------+
only showing top 1 row



In [36]:
## Wrong format: "2016-20-12" is yyyy-dd-MM, which the default yyyy-MM-dd parsing cannot read, so to_date returns null
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

+---------------------+---------------------+
|to_date('2016-20-12')|to_date('2017-12-11')|
+---------------------+---------------------+
|                 null|           2017-12-11|
+---------------------+---------------------+
only showing top 1 row



In [37]:
from pyspark.sql.functions import to_date

# Custom format 
dateFormat = "yyyy-dd-MM"

cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))

cleanDateDF.show(1)


+----------+----------+
|      date|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



In [38]:
from pyspark.sql.functions import to_timestamp

cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

+----------------------------------+
|to_timestamp(`date`, 'yyyy-dd-MM')|
+----------------------------------+
|               2017-11-12 00:00:00|
+----------------------------------+



### Working with Nulls

In [39]:
# Coalesce - select the first non-null value from a set of columns

from pyspark.sql.functions import coalesce

df.select(coalesce(col("Description"), col("CustomerId"))).show(5)

+---------------------------------+
|coalesce(Description, CustomerId)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
+---------------------------------+
only showing top 5 rows



In [40]:
# ifnull - select the second value if the first is null, and defaults to the first
# nullif - returns null if the two values are equal; otherwise returns the first value
# nvl    - returns the second value if the first is null, but defaults to the first
# nvl2   - returns the second value if the first is not null; otherwise, it will return the last specified value
# All above functions are in Spark SQL not in Pyspark

spark.range(1).select(expr("nullif('value', 'value')"),
                      expr("nvl(null, 'return_value')"),
                      expr("nvl2('not_null', 'return_value', 'else_value')")).show()

+------------------------+-------------------------+----------------------------------------------+
|nullif('value', 'value')|nvl(NULL, 'return_value')|nvl2('not_null', 'return_value', 'else_value')|
+------------------------+-------------------------+----------------------------------------------+
|                    null|             return_value|                                  return_value|
+------------------------+-------------------------+----------------------------------------------+



In [41]:
# Drop

# "any" - drops a row if any of the values are null (default)
# "all" - drops the row only if all values are null or NaN for that row

df.na.drop("all", subset=["StockCode", "InvoiceNo"])


DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [42]:
# Fill - fill one or more columns with a set of values

fill_cols_vals = {"StockCode": 5, "Description" : "No Value"}
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [43]:
# Replace - more flexible than fill

df.na.replace([""], ["UNKNOWN"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

### Complex Types

#### Structs

In [44]:
from pyspark.sql.functions import struct

complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.show(5, False)

+---------------------------------------------+
|complex                                      |
+---------------------------------------------+
|[WHITE HANGING HEART T-LIGHT HOLDER, 536365] |
|[WHITE METAL LANTERN, 536365]                |
|[CREAM CUPID HEARTS COAT HANGER, 536365]     |
|[KNITTED UNION FLAG HOT WATER BOTTLE, 536365]|
|[RED WOOLLY HOTTIE WHITE HEART., 536365]     |
+---------------------------------------------+
only showing top 5 rows



In [45]:
complexDF.select("complex.Description").show(5, False)

+-----------------------------------+
|Description                        |
+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |
+-----------------------------------+
only showing top 5 rows



In [46]:
complexDF.select(col("complex").getField("Description")).show(5, False)

+-----------------------------------+
|complex.Description                |
+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.     |
+-----------------------------------+
only showing top 5 rows



In [47]:
complexDF.select("complex.*").show(5, False)

+-----------------------------------+---------+
|Description                        |InvoiceNo|
+-----------------------------------+---------+
|WHITE HANGING HEART T-LIGHT HOLDER |536365   |
|WHITE METAL LANTERN                |536365   |
|CREAM CUPID HEARTS COAT HANGER     |536365   |
|KNITTED UNION FLAG HOT WATER BOTTLE|536365   |
|RED WOOLLY HOTTIE WHITE HEART.     |536365   |
+-----------------------------------+---------+
only showing top 5 rows



#### Arrays

In [48]:
# Split 

from pyspark.sql.functions import split

df.select(split(col("Description"), " ")).show(5, False)

+------------------------------------------+
|split(Description,  )                     |
+------------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]  |
|[WHITE, METAL, LANTERN]                   |
|[CREAM, CUPID, HEARTS, COAT, HANGER]      |
|[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]|
|[RED, WOOLLY, HOTTIE, WHITE, HEART.]      |
+------------------------------------------+
only showing top 5 rows



In [49]:
df.select(split(col("Description"), " ").alias("array_col"), col("Description")).selectExpr("array_col[0]", "Description").show(5, False)

+------------+-----------------------------------+
|array_col[0]|Description                        |
+------------+-----------------------------------+
|WHITE       |WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE       |WHITE METAL LANTERN                |
|CREAM       |CREAM CUPID HEARTS COAT HANGER     |
|KNITTED     |KNITTED UNION FLAG HOT WATER BOTTLE|
|RED         |RED WOOLLY HOTTIE WHITE HEART.     |
+------------+-----------------------------------+
only showing top 5 rows



In [50]:
# Length

from pyspark.sql.functions import size

df.select(size(split(col("Description"), " ")).alias("Size"), col("Description")).show(5, False)

+----+-----------------------------------+
|Size|Description                        |
+----+-----------------------------------+
|5   |WHITE HANGING HEART T-LIGHT HOLDER |
|3   |WHITE METAL LANTERN                |
|5   |CREAM CUPID HEARTS COAT HANGER     |
|6   |KNITTED UNION FLAG HOT WATER BOTTLE|
|5   |RED WOOLLY HOTTIE WHITE HEART.     |
+----+-----------------------------------+
only showing top 5 rows



In [51]:
# Contains

from pyspark.sql.functions import array_contains

df.select(array_contains(split(col("Description"), " "), "WHITE"), col("Description")).show(5, False)

+--------------------------------------------+-----------------------------------+
|array_contains(split(Description,  ), WHITE)|Description                        |
+--------------------------------------------+-----------------------------------+
|true                                        |WHITE HANGING HEART T-LIGHT HOLDER |
|true                                        |WHITE METAL LANTERN                |
|false                                       |CREAM CUPID HEARTS COAT HANGER     |
|false                                       |KNITTED UNION FLAG HOT WATER BOTTLE|
|true                                        |RED WOOLLY HOTTIE WHITE HEART.     |
+--------------------------------------------+-----------------------------------+
only showing top 5 rows



In [52]:
# Explode

from pyspark.sql.functions import split, explode

# Split each description on spaces into an array column, then explode that
# array so every word gets its own output row.
(
    df.withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "exploded")
    .show(5, False)
)

+----------------------------------+---------+--------+
|Description                       |InvoiceNo|exploded|
+----------------------------------+---------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |WHITE   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HANGING |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HEART   |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |T-LIGHT |
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |HOLDER  |
+----------------------------------+---------+--------+
only showing top 5 rows



#### Maps

In [53]:
from pyspark.sql.functions import create_map

complexDF = df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
complexDF.show(5, False)

+-----------------------------------------------+
|complex_map                                    |
+-----------------------------------------------+
|[WHITE HANGING HEART T-LIGHT HOLDER -> 536365] |
|[WHITE METAL LANTERN -> 536365]                |
|[CREAM CUPID HEARTS COAT HANGER -> 536365]     |
|[KNITTED UNION FLAG HOT WATER BOTTLE -> 536365]|
|[RED WOOLLY HOTTIE WHITE HEART. -> 536365]     |
+-----------------------------------------------+
only showing top 5 rows



In [54]:
complexDF.selectExpr("complex_map['WHITE METAL LANTERN']").show(5)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
|                            null|
|                            null|
|                            null|
+--------------------------------+
only showing top 5 rows



In [55]:
# Explode Map

complexDF.selectExpr("explode(complex_map)").show(10, False)


+-----------------------------------+------+
|key                                |value |
+-----------------------------------+------+
|WHITE HANGING HEART T-LIGHT HOLDER |536365|
|WHITE METAL LANTERN                |536365|
|CREAM CUPID HEARTS COAT HANGER     |536365|
|KNITTED UNION FLAG HOT WATER BOTTLE|536365|
|RED WOOLLY HOTTIE WHITE HEART.     |536365|
|SET 7 BABUSHKA NESTING BOXES       |536365|
|GLASS STAR FROSTED T-LIGHT HOLDER  |536365|
|HAND WARMER UNION JACK             |536366|
|HAND WARMER RED POLKA DOT          |536366|
|ASSORTED COLOUR BIRD ORNAMENT      |536367|
+-----------------------------------+------+
only showing top 10 rows



### JSON

In [56]:
jsonDF = spark.range(1).selectExpr("""'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

In [57]:
jsonDF.show(1, False)

+-------------------------------------------+
|jsonString                                 |
+-------------------------------------------+
|{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}|
+-------------------------------------------+



In [58]:
from pyspark.sql.functions import get_json_object, json_tuple

# Two ways to read from the JSON string column: a JSON path down to the first
# array element, and json_tuple for the top-level "myJSONKey" entry.
jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[0]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey"),
).show(1, False)

+------+-----------------------+
|column|c0                     |
+------+-----------------------+
|1     |{"myJSONValue":[1,2,3]}|
+------+-----------------------+



In [59]:
# Converting struct to JSON

from pyspark.sql.functions import to_json

df.selectExpr("(InvoiceNo, Description) as myStruct").select(to_json(col("myStruct"))).show(5, False)

+--------------------------------------------------------------------------+
|structstojson(myStruct)                                                   |
+--------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"} |
|{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}                |
|{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}     |
|{"InvoiceNo":"536365","Description":"KNITTED UNION FLAG HOT WATER BOTTLE"}|
|{"InvoiceNo":"536365","Description":"RED WOOLLY HOTTIE WHITE HEART."}     |
+--------------------------------------------------------------------------+
only showing top 5 rows



### UDF Example

In [60]:
udfExampleDF = spark.range(5).toDF("num")

# 1. Create function 
def power3(double_value):
    """Return ``double_value`` raised to the third power.

    Args:
        double_value: Number to cube.

    Returns:
        ``double_value ** 3``.
    """
    return double_value ** 3


# Quick local check of the plain Python function before wrapping it as a UDF.
# This call used to sit inside the function body after the return statement,
# where it could never execute.
power3(2.0)

In [61]:
# 2. Register as UDF

from pyspark.sql.functions import udf

# Wrap the plain Python function so it can be applied to DataFrame columns.
# NOTE(review): no return type is passed here, so the UDF's result type is
# whatever pyspark's `udf` defaults to — confirm that is the intended type.
power3udf = udf(f=power3)

In [62]:
# 3. Use
# NOTE(review): the output recorded for this cell is a Py4JJavaError
# ("Cannot run program ... CreateProcess error=5, Access is denied"): the
# command Spark tried to launch as the Python worker is a directory, not an
# executable. Presumably PYSPARK_PYTHON needs to point at the interpreter
# executable before the SparkSession is created — confirm and re-run.
(
    udfExampleDF
    .select(power3udf(col("num")))
    .show(5, truncate=False)
)

Py4JJavaError: An error occurred while calling o457.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 82.0 failed 1 times, most recent failure: Lost task 0.0 in stage 82.0 (TID 321, localhost, executor driver): java.io.IOException: Cannot run program "C:\Users\utkar\Anaconda3\": CreateProcess error=5, Access is denied
	at java.lang.ProcessBuilder.start(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:155)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:77)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: CreateProcess error=5, Access is denied
	at java.lang.ProcessImpl.create(Native Method)
	at java.lang.ProcessImpl.<init>(Unknown Source)
	at java.lang.ProcessImpl.start(Unknown Source)
	... 30 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor64.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.io.IOException: Cannot run program "C:\Users\utkar\Anaconda3\": CreateProcess error=5, Access is denied
	at java.lang.ProcessBuilder.start(Unknown Source)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:155)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:97)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:109)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:77)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:89)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.io.IOException: CreateProcess error=5, Access is denied
	at java.lang.ProcessImpl.create(Native Method)
	at java.lang.ProcessImpl.<init>(Unknown Source)
	at java.lang.ProcessImpl.start(Unknown Source)
	... 30 more
