In [1]:
from functools import partial

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, regexp_replace, flatten, explode, struct,
    create_map, array
)
from pyspark.sql.types import (
    IntegerType, StringType, ArrayType, TimestampType,
    StructType, StructField
)

In [2]:
# Obtain the Spark session used by every cell below; "chap2" is the
# application name passed to the builder.
spark = (
    SparkSession.builder
    .appName("chap2")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/19 12:07:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Cleaning & Preparing Your Data

In [3]:
# Sample player records as (name, position, id) tuples. The rows include an
# exact duplicate (Alisson), a missing id, a missing position and an id of 0,
# all of which the cleaning examples further down operate on.
df1 = spark.createDataFrame(
    [
        ("Alisson", "GK", 1),
        ("Walter", "DM", 3),
        ("Virgil", "DF", 4),
        ("Ibrahim", "DF", None),
        ("Endo", None, 3),
        ("Alisson", "GK", 1),
        ("Luis", "LW", 0),
    ],
    schema=["name", "position", "id"],
)
# Bare expression: the notebook displays the DataFrame's column names and types.
df1

DataFrame[name: string, position: string, id: bigint]

In [4]:
# Print the contents of df1 as a text table (all seven sample rows).
df1.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+--------+----+
|   name|position|  id|
+-------+--------+----+
|Alisson|      GK|   1|
| Walter|      DM|   3|
| Virgil|      DF|   4|
|Ibrahim|      DF|null|
|   Endo|    null|   3|
|Alisson|      GK|   1|
|   Luis|      LW|   0|
+-------+--------+----+



                                                                                

## Duplicate Values

Show only unique rows:

In [5]:
# Collapse fully identical rows (the repeated Alisson record) and print the result.
(
    df1
    .distinct()
    .show()
)

+-------+--------+----+
|   name|position|  id|
+-------+--------+----+
|Alisson|      GK|   1|
| Walter|      DM|   3|
| Virgil|      DF|   4|
|Ibrahim|      DF|null|
|   Endo|    null|   3|
|   Luis|      LW|   0|
+-------+--------+----+



Filter out duplicates based on entire rows (same as using `distinct()`):

In [6]:
# With no column list, duplicates are judged on the entire row.
(
    df1
    .dropDuplicates()
    .show()
)

+-------+--------+----+
|   name|position|  id|
+-------+--------+----+
|Alisson|      GK|   1|
| Walter|      DM|   3|
| Virgil|      DF|   4|
|Ibrahim|      DF|null|
|   Endo|    null|   3|
|   Luis|      LW|   0|
+-------+--------+----+



Filter out duplicates based on a subset of columns:

In [7]:
# Keep a single row per distinct "id"; the null id is kept as its own row.
# NOTE(review): which of the rows sharing an id survives (e.g. Walter vs. Endo
# for id 3) is presumably not guaranteed by Spark - confirm before relying on it.
(
    df1
    .dropDuplicates(["id"])
    .show()
)

+-------+--------+----+
|   name|position|  id|
+-------+--------+----+
|Ibrahim|      DF|null|
|   Luis|      LW|   0|
|Alisson|      GK|   1|
| Walter|      DM|   3|
| Virgil|      DF|   4|
+-------+--------+----+



## Working With Nulls

Filter rows based on the presence or absence of nulls in one or more columns:

In [8]:
# Keep only the rows whose "id" is missing.
(
    df1
    .filter(col("id").isNull())
    .show()
)

+-------+--------+----+
|   name|position|  id|
+-------+--------+----+
|Ibrahim|      DF|null|
+-------+--------+----+



In [9]:
# Keep only the rows where both "id" and "position" are present, expressed as
# "not (id is missing or position is missing)".
(
    df1
    .filter(~(col("id").isNull() | col("position").isNull()))
    .show()
)

+-------+--------+---+
|   name|position| id|
+-------+--------+---+
|Alisson|      GK|  1|
| Walter|      DM|  3|
| Virgil|      DF|  4|
|Alisson|      GK|  1|
|   Luis|      LW|  0|
+-------+--------+---+



Create a null label column:

In [10]:
# Append a boolean "no_id" column that is true exactly where "id" is null.
(
    df1
    .withColumn("no_id", col("id").isNull())
    .show()
)

+-------+--------+----+-----+
|   name|position|  id|no_id|
+-------+--------+----+-----+
|Alisson|      GK|   1|false|
| Walter|      DM|   3|false|
| Virgil|      DF|   4|false|
|Ibrahim|      DF|null| true|
|   Endo|    null|   3|false|
|Alisson|      GK|   1|false|
|   Luis|      LW|   0|false|
+-------+--------+----+-----+



Account for cases where missing values are encoded with a placeholder (here `0`) by converting them to real nulls:

In [11]:
# Treat the placeholder id 0 as missing: rows with id == 0 get a null id, while
# every other row (including ids that are already null) keeps its current value.
(
    df1
    .withColumn("id", when(col("id") == 0, None).otherwise(col("id")))
    .show()
)

+-------+--------+----+
|   name|position|  id|
+-------+--------+----+
|Alisson|      GK|   1|
| Walter|      DM|   3|
| Virgil|      DF|   4|
|Ibrahim|      DF|null|
|   Endo|    null|   3|
|Alisson|      GK|   1|
|   Luis|      LW|null|
+-------+--------+----+



## Using RegEx

In [12]:
# Sample (name, class) enrolment records used by the regex examples below.
df2 = spark.createDataFrame(
    [
        ("Alisson", "Math"),
        ("Virgil", "Computer Science"),
        ("Ibrahim", "English"),
        ("Alexis", "Computer Science"),
    ],
    schema=["name", "class"],
)

df2.show()

+-------+----------------+
|   name|           class|
+-------+----------------+
|Alisson|            Math|
| Virgil|Computer Science|
|Ibrahim|         English|
| Alexis|Computer Science|
+-------+----------------+



Replace one value with an alternative:

In [13]:
# Show the original class next to a copy in which "English" has been swapped
# for "Design"; values without a match pass through unchanged.
(
    df2
    .select(
        col("class").alias("initial_choice"),
        regexp_replace(col("class"), "English", "Design").alias("final_choice"),
    )
    .show()
)

+----------------+----------------+
|  initial_choice|    final_choice|
+----------------+----------------+
|            Math|            Math|
|Computer Science|Computer Science|
|         English|          Design|
|Computer Science|Computer Science|
+----------------+----------------+



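Apply a pattern replacement (here a leading `Computer`) to every value of a column, keeping the column name; values that do not match are left unchanged: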

In [14]:
# Rewrite "class" so that a value starting with "Computer" starts with "Data"
# instead ("^" anchors the pattern to the beginning of the string).
(
    df2
    .withColumn("class", regexp_replace("class", "^Computer", "Data"))
    .show()
)

+-------+------------+
|   name|       class|
+-------+------------+
|Alisson|        Math|
| Virgil|Data Science|
|Ibrahim|     English|
| Alexis|Data Science|
+-------+------------+

