# 1. PySpark - show()

In [103]:
import json
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,ArrayType,MapType,DoubleType,BooleanType
from pyspark.sql.functions import *
from pyspark.sql import Row
from pyspark.sql.window import Window
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()


In [2]:
columns = ["Seqno","Quote"]
data = [("1", "Be the change that you wish to see in the world"),
    ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
    ("3", "The purpose of our lives is to be happy."),
    ("4", "Be cool.")]
df = spark.createDataFrame(data,columns)
df.show()

+-----+--------------------+
|Seqno|               Quote|
+-----+--------------------+
|    1|Be the change tha...|
|    2|Everyone thinks o...|
|    3|The purpose of ou...|
|    4|            Be cool.|
+-----+--------------------+



In [3]:
df.show(truncate=False)

+-----+-----------------------------------------------------------------------------+
|Seqno|Quote                                                                        |
+-----+-----------------------------------------------------------------------------+
|1    |Be the change that you wish to see in the world                              |
|2    |Everyone thinks of changing the world, but no one thinks of changing himself.|
|3    |The purpose of our lives is to be happy.                                     |
|4    |Be cool.                                                                     |
+-----+-----------------------------------------------------------------------------+



# 2. PySpark - StructType & StructField

The StructType and StructField classes in PySpark are used to specify the custom schema to the DataFrame and create complex columns like nested struct, array, and map columns. StructType is a collection of StructField objects that define column name, column data type, boolean to specify if the field can be nullable or not, and metadata

* StructType – Defines the structure of the DataFrame
* StructField – Defines the metadata of the DataFrame column: column names, data types, and whether they’re nullable


In [4]:
data = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

schema = StructType([ 
    StructField("firstname",StringType(),True), 
    StructField("middlename",StringType(),True), 
    StructField("lastname",StringType(),True), 
    StructField("id", StringType(), True), 
    StructField("gender", StringType(), True), 
    StructField("salary", IntegerType(), True) 
  ])

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



### Defining Nested StructType object struct 

In [5]:
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df2 = spark.createDataFrame(data = structureData, schema = structureSchema)
df2.printSchema()
df2.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



### Adding & Changing struct of the DataFrame

In [6]:
updatedDF = df2.withColumn("OtherInfo", 
    struct(col("id").alias("identifier"),
    col("gender").alias("gender"),
    col("salary").alias("salary"),
    when(col("salary").cast(IntegerType()) < 2000,"Low")
      .when(col("salary").cast(IntegerType()) < 4000,"Medium")
      .otherwise("High").alias("Salary_Grade")
  )).drop("id","gender","salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+------------------------+
|name                |OtherInfo               |
+--------------------+------------------------+
|{James, , Smith}    |{36636, M, 3100, Medium}|
|{Michael, Rose, }   |{40288, M, 4300, High}  |
|{Robert, , Williams}|{42114, M, 1400, Low}   |
|{Maria, Anne, Jones}|{39192, F, 5500, High}  |
|{Jen, Mary, Brown}  |{, F, -1, Low}          |
+--------------------+------------------------+



### Using SQL ArrayType and MapType

In [7]:
arrayStructureSchema = StructType([
    StructField('name', StructType([
       StructField('firstname', StringType(), True),
       StructField('middlename', StringType(), True),
       StructField('lastname', StringType(), True)
       ])),
       StructField('hobbies', ArrayType(StringType()), True),
       StructField('properties', MapType(StringType(),StringType()), True)
    ])

Output:

root
 |-- name: struct (nullable = true)  
 |    |-- firstname: string (nullable = true)  
 |    |-- middlename: string (nullable = true)  
 |    |-- lastname: string (nullable = true)  
 |-- hobbies: array (nullable = true)  
 |    |-- element: string (containsNull = true)  
 |-- properties: map (nullable = true)  
 |    |-- key: string   
 |    |-- value: string (valueContainsNull = true)   

### Creating StructType object struct from JSON file

In [8]:
print(df2.schema.json())

{"fields":[{"metadata":{},"name":"name","nullable":true,"type":{"fields":[{"metadata":{},"name":"firstname","nullable":true,"type":"string"},{"metadata":{},"name":"middlename","nullable":true,"type":"string"},{"metadata":{},"name":"lastname","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"salary","nullable":true,"type":"integer"}],"type":"struct"}


### Check DataFrame Column Exists

In [9]:
if "firstname" in df.columns:
    print("Column 'firstname' exists in the DataFrame.")
else:
    print("Column 'firstname' does not exist in the DataFrame.")

Column 'firstname' exists in the DataFrame.


# 3. PySpark - Column functions

pyspark.sql.Column class provides several functions to work with DataFrame to manipulate the Column values, evaluate the boolean expression to filter rows, retrieve a value or part of a value from a DataFrame column, and to work with list, map & struct columns

In [10]:
data=[("James",23),("Ann",40)]
df=spark.createDataFrame(data).toDF("name.fname","gender")
df.printSchema()


root
 |-- name.fname: string (nullable = true)
 |-- gender: long (nullable = true)



In [11]:
# Using DataFrame object (df)
df.select(df.gender).show()
df.select(df["gender"]).show()
#Accessing column name with dot (with backticks)
df.select(df["`name.fname`"]).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+

+------+
|gender|
+------+
|    23|
|    40|
+------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [12]:
#Using SQL col() function
df.select(col("gender")).show()
#Accessing column name with dot (with backticks)
df.select(col("`name.fname`")).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [13]:
#Create DataFrame with struct using Row class
data = [
    Row (name="James", prop = Row(hair = "black", eye = "blue")),
    Row (name="Ann", prop = Row(hair = "grey", eye = "black"))
       ]
df = spark.createDataFrame(data)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)

+-----+-------------+
| name|         prop|
+-----+-------------+
|James|{black, blue}|
|  Ann|{grey, black}|
+-----+-------------+



In [14]:
df.select('name').show()
df.select(df['prop.hair']).show()
df.select(col("prop.*")).show()

+-----+
| name|
+-----+
|James|
|  Ann|
+-----+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+



In [15]:
data=[(100,2,1),(200,3,4),(300,4,4)]
df = spark.createDataFrame(data)
df.show()
df = spark.createDataFrame(data).toDF("col1","col2","col3")
df.show()

+---+---+---+
| _1| _2| _3|
+---+---+---+
|100|  2|  1|
|200|  3|  4|
|300|  4|  4|
+---+---+---+

+----+----+----+
|col1|col2|col3|
+----+----+----+
| 100|   2|   1|
| 200|   3|   4|
| 300|   4|   4|
+----+----+----+



### PySpark Column functions

Column Function	Function Description       
alias(*alias, **kwargs)       
name(*alias, **kwargs)	Provides alias to the column or expressions       
name() returns same as alias().       
asc()       
asc_nulls_first()       
asc_nulls_last()	Returns ascending order of the column.       
asc_nulls_first() Returns null values first then non-null values.       
asc_nulls_last() – Returns null values after non-null values.       
astype(dataType)       
cast(dataType)	Used to cast the data type to another type.       
astype() returns same as cast().       
between(lowerBound, upperBound)	Checks if the columns values are between lower and upper bound. Returns boolean value.       
bitwiseAND(other)       
bitwiseOR(other)       
bitwiseXOR(other)	Compute bitwise AND, OR & XOR of this expression with another expression respectively.       
contains(other)	Check if String contains in another string.       
desc()       
desc_nulls_first()       
desc_nulls_last()	Returns descending order of the column.       
desc_nulls_first() -null values appear before non-null values.       
desc_nulls_last() – null values appear after non-null values.       
startswith(other)       
endswith(other)	String starts with. Returns boolean expression       
String ends with. Returns boolean expression       
eqNullSafe(other)	Equality test that is safe for null values.       
getField(name)	Returns a field by name in a StructField and by key in Map.       
getItem(key)	Returns a values from Map/Key at the provided position.       
isNotNull()       
isNull()	isNotNull() – Returns True if the current expression is NOT null.       
isNull() – Returns True if the current expression is null.       
isin(*cols)	A boolean expression that is evaluated to true if the value of this expression is contained by the evaluated values of the arguments.       
like(other)       
rlike(other)	Similar to SQL like expression.       
Similar to SQL RLIKE expression (LIKE with Regex).       
over(window)	Used with window column       
substr(startPos, length)	Return a Column which is a substring of the column.       
when(condition, value)       
otherwise(value)	Similar to SQL CASE WHEN, Executes a list of conditions and returns one of multiple possible result expressions.       
dropFields(*fieldNames)	Used to drops fields in StructType by name.       
withField(fieldName, col)	An expression that adds/replaces a field in StructType by name.


### PySpark column function examples

In [16]:
data=[("James","Bond","100",None),
      ("Ann","Varsa","200",'F'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)

In [17]:
#alias
df.select(df.fname).alias("First_Name").show()

#asc() and dsc()
df.sort(df.fname.asc()).show()

#between()
df.filter(df.id.between(100,300)).show()

+----------+
|     fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  NULL|
| Tom Brand| NULL|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



In [18]:
#contains
df.filter(df.fname.contains("Cruise")).show()

#isNull and isNotNull
df.filter(df.gender.isNull()).show()

#when and otherwise
df.select(df.fname, df.lname, when(df.gender=="M","Male") \
          .when(df.gender=="F","Female") \
          .when(df.gender==None,"") \
          .otherwise(df.gender).alias("new_gender") \
         ).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
+-----+-----+---+------+

+----------+-----+----------+
|     fname|lname|new_gender|
+----------+-----+----------+
|     James| Bond|      NULL|
|       Ann|Varsa|    Female|
|Tom Cruise|  XXX|          |
| Tom Brand| NULL|      Male|
+----------+-----+----------+



In [19]:
#isin
li=["100","200"]
df.select(df.fname,df.lname,df.id) \
  .filter(df.id.isin(li)) \
  .show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|James| Bond|100|
|  Ann|Varsa|200|
+-----+-----+---+



# 4. PySpark - select columns

In PySpark, select() function is used to select single, multiple, column by index, all columns from the list and the nested columns from a DataFrame, PySpark select() is a transformation function hence it returns a new DataFrame with the selected columns.

In [20]:
# Data
data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

# Column names
columns = ["firstname","lastname","country","state"]

# Create DataFrame
df = spark.createDataFrame(data = data, schema = columns)
df.show(truncate=False)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+



### Select Single & Multiple Columns From PySpark

In [21]:
# Select columns by different ways
# df.select("firstname","lastname").show()
# df.select(df.firstname,df.lastname).show()
# df.select(df["firstname"],df["lastname"]).show()
# df.select(col("firstname"),col("lastname")).show()

# Select columns by regular expression
df.select(df.colRegex("`^.*name*`")).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



### Select All Columns From List

In [22]:
# Select All columns from List
df.select(*columns).show()

#Select All columns
df.select([col for col in df.columns]).show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



### Select Nested Struct Columns from PySpark

In [23]:
# Create DataFrame with nested columns
data = [
        (("James",None,"Smith"),"OH","M"),
        (("Anna","Rose",""),"NY","F"),
        (("Julia","","Williams"),"OH","F"),
        (("Maria","Anne","Jones"),"NY","M"),
        (("Jen","Mary","Brown"),"NY","M"),
        (("Mike","Mary","Williams"),"OH","M")
        ]

schema = StructType([
    StructField('name', StructType([
         StructField('firstname', StringType(), True),
         StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
         ])),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
     ])

df2 = spark.createDataFrame(data = data, schema = schema)
df2.printSchema()
df2.show(truncate=False) # shows all columns

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+-----+------+
|name                  |state|gender|
+----------------------+-----+------+
|{James, NULL, Smith}  |OH   |M     |
|{Anna, Rose, }        |NY   |F     |
|{Julia, , Williams}   |OH   |F     |
|{Maria, Anne, Jones}  |NY   |M     |
|{Jen, Mary, Brown}    |NY   |M     |
|{Mike, Mary, Williams}|OH   |M     |
+----------------------+-----+------+



In [24]:
# Select struct column
df2.select("name").show(truncate=False)

# Select child columns
df2.select("name.firstname","name.lastname").show(truncate=False)

+----------------------+
|name                  |
+----------------------+
|{James, NULL, Smith}  |
|{Anna, Rose, }        |
|{Julia, , Williams}   |
|{Maria, Anne, Jones}  |
|{Jen, Mary, Brown}    |
|{Mike, Mary, Williams}|
+----------------------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|James    |Smith   |
|Anna     |        |
|Julia    |Williams|
|Maria    |Jones   |
|Jen      |Brown   |
|Mike     |Williams|
+---------+--------+



# 5.PySpark - where() and filter()

In [25]:
# Create data
data = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
 ]

# Create schema        
schema = StructType([
     StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
     ])),
     StructField('languages', ArrayType(StringType()), True),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
 ])

# Create dataframe
df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [26]:
#using equal condition
df.filter(df.state == "OH").show(truncate=False)

#using not equal condition
df.filter(df.state != "OH").show(truncate=False)

# Using SQL Expression
df.filter("gender == 'M'").show()

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|name                |languages         |state|gender|
+--------------------+------------------+-----+------+
|{Anna, Rose, }      |[Spark, Java, C++]|NY   |F     |
|{Maria, Anne, Jones}|[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}  |[CSharp, VB]      |NY   |M     |
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|  

In [27]:
# Filter multiple conditions
df.filter( (df.state  == "OH") & (df.gender  == "M") ) \
    .show(truncate=False)

#Filter IS IN List values
li=["OH","CA","DE"]
df.filter(df.state.isin(li)).show()

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



### Filtering with Regular Expression

In [28]:
# Prepare Data
data2 = [(2,"Michael Rose"),(3,"Robert Williams"),
     (4,"Rames Rose"),(5,"Rames rose")
  ]

df2 = spark.createDataFrame(data=data2, schema=["id","name"])

# like - SQL LIKE pattern
df2.filter(df2.name.like("%rose%")).show()

+---+----------+
| id|      name|
+---+----------+
|  5|Rames rose|
+---+----------+



### Filtering Array column

In [29]:
df.filter(array_contains(df.languages,"Java")).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
|  {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+



In [30]:
# Struct condition (Nested)
df.filter(df.name.lastname == "Williams") \
    .show(truncate=False)

+----------------------+------------+-----+------+
|name                  |languages   |state|gender|
+----------------------+------------+-----+------+
|{Julia, , Williams}   |[CSharp, VB]|OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]|OH   |M     |
+----------------------+------------+-----+------+



# 6. PySpark orderBy() and sort()

Syntax: DataFrame.sort(*cols, **kwargs) and DataFrame.orderBy(*cols, **kwargs)

In [31]:

simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns= ["employee_name","department","state","salary","age","bonus"]
# Create SparkSession

df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [32]:
df.sort("department","state").show()

df.sort(df.department.asc(),df.state.desc()).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|

### Using raw sql

In [33]:
df.createOrReplaceTempView("EMP")
spark.sql("select employee_name,department,state,salary,age,bonus from EMP ORDER BY department asc").show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|James        |Sales     |NY   |90000 |34 |10000|
+-------------+----------+-----+------+---+-----+



In [34]:

# Data
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

# Create DataFrame
schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [35]:
# Using groupBy().sum()
df.groupBy("department").sum("salary").show()

# Using groupBy().count()
df.groupBy(col("department")).count().show()

# Using groupBy().min()
df.groupBy(df['department']).min("salary").show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+

+----------+-----+
|department|count|
+----------+-----+
|     Sales|    3|
|   Finance|    4|
| Marketing|    2|
+----------+-----+

+----------+-----------+
|department|min(salary)|
+----------+-----------+
|     Sales|      81000|
|   Finance|      79000|
| Marketing|      80000|
+----------+-----------+



### Using Multiple columns

In [36]:
# GroupBy on multiple columns
df.groupBy("department","state").sum("salary","bonus").show()

+----------+-----+-----------+----------+
|department|state|sum(salary)|sum(bonus)|
+----------+-----+-----------+----------+
|     Sales|   NY|     176000|     30000|
|     Sales|   CA|      81000|     23000|
|   Finance|   CA|     189000|     47000|
|   Finance|   NY|     162000|     34000|
| Marketing|   NY|      91000|     21000|
| Marketing|   CA|      80000|     18000|
+----------+-----+-----------+----------+



### Running more aggregates at a time

In [37]:
# Running more aggregations
df.groupBy(col("department")).agg(
    sum("salary").alias("sum_salary"),
    avg("salary").alias("avg_salary"),
    sum("bonus").alias("sum_bonus"),
    avg("bonus").alias("avg_bonus")
).show()

+----------+----------+-----------------+---------+------------------+
|department|sum_salary|       avg_salary|sum_bonus|         avg_bonus|
+----------+----------+-----------------+---------+------------------+
|     Sales|    257000|85666.66666666667|    53000|17666.666666666668|
|   Finance|    351000|          87750.0|    81000|           20250.0|
| Marketing|    171000|          85500.0|    39000|           19500.0|
+----------+----------+-----------------+---------+------------------+



### Using filter on aggregate data

In [38]:
df.groupBy(col("department")).agg(
    sum("salary").alias("sum_salary"),
    avg("salary").alias("avg_salary"),
    sum("bonus").alias("sum_bonus"),
    avg("bonus").alias("avg_bonus")
).filter(col("sum_bonus") > 50000).show()

+----------+----------+-----------------+---------+------------------+
|department|sum_salary|       avg_salary|sum_bonus|         avg_bonus|
+----------+----------+-----------------+---------+------------------+
|     Sales|    257000|85666.66666666667|    53000|17666.666666666668|
|   Finance|    351000|          87750.0|    81000|           20250.0|
+----------+----------+-----------------+---------+------------------+



### PySpark SQL GROUP BY Query

In [39]:
df.createOrReplaceTempView("employees")

sql_string = """
select department,sum(salary) as sum_salary, avg(salary) as avg_salary,
        sum(bonus) as sum_bonus, avg(bonus) as avg_bonus
from employees
group by department
having sum(bonus)>50000
"""
df2 = spark.sql(sql_string)
df2.show()

+----------+----------+-----------------+---------+------------------+
|department|sum_salary|       avg_salary|sum_bonus|         avg_bonus|
+----------+----------+-----------------+---------+------------------+
|     Sales|    257000|85666.66666666667|    53000|17666.666666666668|
|   Finance|    351000|          87750.0|    81000|           20250.0|
+----------+----------+-----------------+---------+------------------+



# 8. PySpark - Join()

PySpark Join is used to combine two DataFrames and by chaining these you can join multiple DataFrames; it supports all basic join type operations available in traditional SQL like INNER, LEFT OUTER, RIGHT OUTER, LEFT ANTI, LEFT SEMI, CROSS, SELF JOIN. PySpark Joins are wider transformations that involve data shuffling across the network.


### Syntax
join(self, other, on=None, how=None)

join() operation takes parameters as below and returns DataFrame.

* param other: Right side of the join
* param on: a string for the join column name
* param how: default inner. Must be one of inner, cross, outer,full, full_outer, left, left_outer, right, right_outer,left_semi, and left_anti.


In [40]:
emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1) \
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
       "emp_dept_id","gender","salary"]

empDF = spark.createDataFrame(data=emp, schema = empColumns)
empDF.printSchema()
empDF.show(truncate=False)

dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+-----

### How join works?

* **Common Key**: In order to join two or more datasets we need a common key or a column on which you want to join. This key is used to join the matching rows from the datasets.

* **Partitioning**: PySpark Datasets are distributed and partitioned across multiple nodes in a cluster. Ideally, data with the same join key should be located in the same partition. If the Datasets are not already partitioned on the join key, PySpark may perform a shuffle operation to redistribute the data, ensuring that rows with the same join key are on the same node. Shuffling can be an expensive operation, especially for large Datasets.

In [41]:
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id, "inner").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [42]:
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id, "leftouter").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
|6     |Brown   |2              |2010       |50         |      |-1    |NULL     |NULL   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [43]:
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id, "rightouter").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|NULL  |NULL    |NULL           |NULL       |NULL       |NULL  |NULL  |Sales    |30     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [44]:
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id, "fullouter").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|NULL  |NULL    |NULL           |NULL       |NULL       |NULL  |NULL  |Sales    |30     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
|6     |Brown   |2              |2010       |50         |      |-1    |NULL     |NULL   |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



### Left Semi Join

A Left Semi Join in PySpark returns only the rows from the left DataFrame (the first DataFrame mentioned in the join operation) where there is a match with the right DataFrame (the second DataFrame). It does not include any columns from the right DataFrame in the resulting DataFrame.

In [45]:
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id, "leftsemi").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+



### Left Anti Join

A Left Anti Join in PySpark returns only the rows from the left DataFrame (the first DataFrame mentioned in the join operation) where there is no match with the right DataFrame (the second DataFrame). It excludes any rows from the left DataFrame that have a corresponding key in the right DataFrame. 

In [46]:
empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id, "leftanti").show(truncate=False)

+------+-----+---------------+-----------+-----------+------+------+
|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|6     |Brown|2              |2010       |50         |      |-1    |
+------+-----+---------------+-----------+-----------+------+------+



In [47]:
#self join 
empDF.alias("emp1").join(empDF.alias("emp2"),
    col("emp1.superior_emp_id") == col("emp2.emp_id"), "inner").select(col("emp1.emp_id"),col("emp1.name"), 
      col("emp2.emp_id").alias("superior_emp_id"), 
      col("emp2.name").alias("superior_emp_name")).show()

+------+--------+---------------+-----------------+
|emp_id|    name|superior_emp_id|superior_emp_name|
+------+--------+---------------+-----------------+
|     2|    Rose|              1|            Smith|
|     3|Williams|              1|            Smith|
|     4|   Jones|              2|             Rose|
|     5|   Brown|              2|             Rose|
|     6|   Brown|              2|             Rose|
+------+--------+---------------+-----------------+



In [48]:
#SQL Expression

empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")

joinDF = spark.sql("select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id") \
  .show(truncate=False)

joinDF2 = spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id") \
  .show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+-

In [49]:
# Join on multiple dataFrames

# df1.join(df2,df1.id1 == df2.id2,"inner") \
#    .join(df3,df1.id1 == df3.id3,"inner")

# 9. union() and unionall()

### Syntax
dataFrame1.union(dataFrame2)
dataFrame1.unionAll(dataFrame2)

In [50]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



In [51]:
# Create DataFrame2
simpleData2 = [("James","Sales","NY",90000,34,10000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns2= ["employee_name","department","state","salary","age","bonus"]

df2 = spark.createDataFrame(data = simpleData2, schema = columns2)

df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [52]:
# union() to merge two DataFrames
unionDF = df.union(df2)
unionDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [53]:
# unionAll() to merge two DataFrames
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [54]:
# Remove duplicates after union() using distinct()
disDF = df.union(df2).distinct()
disDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



# 10. PySpark - DataFrame.transform() and sql.functions.transform()

* The pyspark.sql.DataFrame.transform() is used to chain the custom transformations and this function returns the new DataFrame after applying the specified transformations.

### Syntax:  
DataFrame.transform(func: Callable[[…], DataFrame], *args: Any, **kwargs: Any) → pyspark.sql.dataframe.DataFrame

* The PySpark sql.functions.transform() is used to apply the transformation on a column of type Array. This function applies the specified transformation on every element of the array and returns an object of ArrayType.

### Syntax:  
pyspark.sql.functions.transform(col, f)

In [55]:
# Prepare Data
simpleData = (("Java",4000,5), \
    ("Python", 4600,10),  \
    ("Scala", 4100,15),   \
    ("Scala", 4500,15),   \
    ("PHP", 3000,20),  \
  )
columns= ["CourseName", "fee", "discount"]

# Create DataFrame
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- CourseName: string (nullable = true)
 |-- fee: long (nullable = true)
 |-- discount: long (nullable = true)

+----------+----+--------+
|CourseName|fee |discount|
+----------+----+--------+
|Java      |4000|5       |
|Python    |4600|10      |
|Scala     |4100|15      |
|Scala     |4500|15      |
|PHP       |3000|20      |
+----------+----+--------+



In [56]:
#PySpark DataFrame.transform()

# Custom transformation 1
def to_upper_str_columns(df):
    return df.withColumn("CourseName",upper(df.CourseName))

# Custom transformation 2
def reduce_price(df,reduceBy):
    return df.withColumn("new_fee",df.fee - reduceBy)

# Custom transformation 3
def apply_discount(df):
    return df.withColumn("discounted_fee",  \
             df.new_fee - (df.new_fee * df.discount) / 100)

# PySpark transform() Usage
df2 = df.transform(to_upper_str_columns) \
        .transform(reduce_price,1000) \
        .transform(apply_discount)
df2.show()

+----------+----+--------+-------+--------------+
|CourseName| fee|discount|new_fee|discounted_fee|
+----------+----+--------+-------+--------------+
|      JAVA|4000|       5|   3000|        2850.0|
|    PYTHON|4600|      10|   3600|        3240.0|
|     SCALA|4100|      15|   3100|        2635.0|
|     SCALA|4500|      15|   3500|        2975.0|
|       PHP|3000|      20|   2000|        1600.0|
+----------+----+--------+-------+--------------+



In [57]:
#PySpark sql.functions.transform()

# Create DataFrame with Array
data = [
 ("James,,Smith",["Java","Scala","C++"],["Spark","Java"]),
 ("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"]),
 ("Robert,,Williams",["CSharp","VB"],["Spark","Python"])
]
df = spark.createDataFrame(data=data,schema=["Name","Languages1","Languages2"])
df.printSchema()
df.show()

# using transform() function
df.select(transform("Languages1", lambda x: upper(x)).alias("languages1")) \
  .show()

root
 |-- Name: string (nullable = true)
 |-- Languages1: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Languages2: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+------------------+---------------+
|            Name|        Languages1|     Languages2|
+----------------+------------------+---------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|
+----------------+------------------+---------------+

+------------------+
|        languages1|
+------------------+
|[JAVA, SCALA, C++]|
|[SPARK, JAVA, C++]|
|      [CSHARP, VB]|
+------------------+



# 11. PySpark - fillna() and fill()

In PySpark,fillna() from DataFrame class or fill() from DataFrameNaFunctions is used to replace NULL/None values on all or selected multiple columns with either zero(0), empty string, space, or any constant literal values.

As part of the cleanup, sometimes you may need to Drop Rows with NULL/None Values in PySpark DataFrame and Filter Rows by checking IS NULL/NOT NULL conditions.

### Syntax:  
fillna(value, subset=None)  
fill(value, subset=None)  

In [58]:
filePath="small_zipcode.csv"
df = spark.read.options(header='true', inferSchema='true') \
          .csv(filePath)

df.printSchema()
df.show(truncate=False)

root
 |-- id: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- population: integer (nullable = true)

+---+-------+--------+-------------------+-----+----------+
|id |zipcode|type    |city               |state|population|
+---+-------+--------+-------------------+-----+----------+
|1  |704    |STANDARD|NULL               |PR   |30100     |
|2  |704    |NULL    |PASEO COSTA DEL SUR|PR   |NULL      |
|3  |709    |NULL    |BDA SAN LUIS       |PR   |3700      |
|4  |76166  |UNIQUE  |CINGULAR WIRELESS  |TX   |84000     |
|5  |76177  |STANDARD|NULL               |TX   |NULL      |
+---+-------+--------+-------------------+-----+----------+



In [59]:
#Replace 0 for null for all integer columns
df.na.fill(value=0).show()

#Replace 0 for null on only population column 
df.na.fill(value=0,subset=["population"]).show()

#Replace null with empty string "" for all string columns
df.na.fill("").show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NULL|   TX|         0|
+---+-------+--------+-------------------+-----+----------+

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               NULL|   PR|     30100|
|  2|    704|    NULL|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    NULL|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               NU

In [60]:
df.na.fill("unknown",["city"]).na.fill("",["type"]).show()

#the above statement can also be written like this
#df.na.fill({"city": "unknown", "type": ""}).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|            unknown|   PR|     30100|
|  2|    704|        |PASEO COSTA DEL SUR|   PR|      NULL|
|  3|    709|        |       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|            unknown|   TX|      NULL|
+---+-------+--------+-------------------+-----+----------+



# 12. PySpark - pivot()

PySpark pivot() function is used to rotate/transpose the data from one column into multiple Dataframe columns and back using unpivot(). Pivot() It is an aggregation where one of the grouping columns values is transposed into individual columns with distinct data

### Syntax:
pivot_df = original_df.groupBy("grouping_column").pivot("pivot_column").agg({"agg_column": "agg_function"})

* grouping_column: The column used for grouping.

* pivot_column: The column whose distinct values become new columns.

* agg_column: The column for which aggregation is applied (e.g., using a function like sum, avg, etc.).


In [61]:
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]

columns= ["Product","Amount","Country"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [62]:
pivotDF = df.groupBy("Country").pivot("Product").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Country: string (nullable = true)
 |-- Banana: long (nullable = true)
 |-- Beans: long (nullable = true)
 |-- Carrots: long (nullable = true)
 |-- Orange: long (nullable = true)

+-------+------+-----+-------+------+
|Country|Banana|Beans|Carrots|Orange|
+-------+------+-----+-------+------+
|China  |400   |1500 |1200   |4000  |
|USA    |1000  |1600 |1500   |2000  |
|Mexico |NULL  |2000 |NULL   |NULL  |
|Canada |2000  |NULL |2000   |NULL  |
+-------+------+-----+-------+------+



In [63]:
pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |NULL  |4000 |NULL  |2000|
|Beans  |NULL  |1500 |2000  |1600|
|Banana |2000  |400  |NULL  |1000|
|Carrots|2000  |1200 |NULL  |1500|
+-------+------+-----+------+----+



In [64]:
# Applying unpivot()
unpivotExpr = "stack(4, 'Canada', Canada, 'China', China, 'Mexico', Mexico, 'USA', USA) as (Country,Total)"
unPivotDF = pivotDF.select("Product", expr(unpivotExpr)).where("Total is not null")
unPivotDF.show(truncate=False)

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
|Orange |China  |4000 |
|Orange |USA    |2000 |
|Beans  |China  |1500 |
|Beans  |Mexico |2000 |
|Beans  |USA    |1600 |
|Banana |Canada |2000 |
|Banana |China  |400  |
|Banana |USA    |1000 |
|Carrots|Canada |2000 |
|Carrots|China  |1200 |
|Carrots|USA    |1500 |
+-------+-------+-----+



# 13. PySpark - partitionBy()

PySpark supports partition in two ways; partition in memory (DataFrame) and partition on the disk (File system).

**Partition in memory:** You can partition or repartition the DataFrame by calling repartition() or coalesce() transformations.

**Partition on disk:** While writing the PySpark DataFrame back to disk, you can choose how to partition the data based on columns using partitionBy() of pyspark.sql.DataFrameWriter. This is similar to Hives partitions scheme.

Below are some of the advantages of using PySpark partitions on memory or on disk.

* Fast access to the data
* Provides the ability to perform an operation on a smaller dataset  
  
Partition at rest (disk) is a feature of many databases and data processing frameworks and it is key to make jobs work at scale.

### Syntax:

partitionBy(self, *cols)

In [65]:
# Create DataFrame by reading CSV file
df = spark.read.option("header", True).csv("simple-zipcodes.csv")
df.printSchema()
df.show(5, truncate=False)
df.count()

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|City               |Zipcode|State|
+------------+-------+-------------------+-------+-----+
|1           |US     |PARC PARQUE        |704    |PR   |
|2           |US     |PASEO COSTA DEL SUR|704    |PR   |
|10          |US     |BDA SAN LUIS       |709    |PR   |
|49347       |US     |HOLT               |32564  |FL   |
|49348       |US     |HOMOSASSA          |34487  |FL   |
+------------+-------+-------------------+-------+-----+
only showing top 5 rows



20

In [66]:
df.write.option("header",True) \
    .partitionBy("state") \
    .mode("overwrite") \
    .csv("zipcodes-state")

In [67]:
#partitionBy multiple columns

df.write.option("header",True) \
    .partitionBy("state","city") \
    .mode("overwrite") \
    .csv("zipcodes-state-city")

### Data skew

Use option maxRecordsPerFile if you want to control the number of records for each partition. This is particularly helpful when your data is **skewed (Having some partitions with very low records and other partitions with a high number of records)**.

In [68]:
#partitionBy() control number of partitions
df.write.option("header",True) \
        .option("maxRecordsPerFile", 2) \
        .partitionBy("state") \
        .mode("overwrite") \
        .csv("zipcodes-state-skewed")

### SQL - Read partition data

In [69]:
paraqDF = spark.read.option("header",True) \
    .csv("zipcodes-state-city")
paraqDF.createOrReplaceTempView("ZIPCODE")
spark.sql("select * from ZIPCODE where state = 'AL' and city = 'SPRINGVILLE'").show()

+------------+-------+-------+-----+-----------+
|RecordNumber|Country|Zipcode|state|       city|
+------------+-------+-------+-----+-----------+
|       54355|     US|  35146|   AL|SPRINGVILLE|
+------------+-------+-------+-----+-----------+



# 14. PySpark SQL - Aggregate Functions

**PySpark Aggregate Functions**
PySpark SQL Aggregate functions are grouped as “agg_funcs” in Pyspark. Below is a list of functions defined under this group. Click on each link to learn with example.

* approx_count_distinct
* avg
* collect_list
* collect_set
* countDistinct
* count
* grouping
* first
* last
* kurtosis
* max
* min
* mean
* skewness
* stddev
* stddev_pop
* stddev_samp
* sum
* sumDistinct
* variance, var_samp, var_pop

In [70]:
simpleData = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name","department","salary"]
df = spark.createDataFrame(data=simpleData,schema=schema)
df.printSchema()
df.show()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|        James|     Sales|  3000|
|      Michael|     Sales|  4600|
|       Robert|     Sales|  4100|
|        Maria|   Finance|  3000|
|        James|     Sales|  3000|
|        Scott|   Finance|  3300|
|          Jen|   Finance|  3900|
|         Jeff| Marketing|  3000|
|        Kumar| Marketing|  2000|
|         Saif|     Sales|  4100|
+-------------+----------+------+



In [71]:
# print("approx_count_distinct: " + \
#       str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

# df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)

avg: 3400.0
+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+

+----------+----+--------+-------+--------------+
|CourseName|fee |discount|new_fee|discounted_fee|
+----------+----+--------+-------+--------------+
|JAVA      |4000|5       |3000   |2850.0        |
|PYTHON    |4600|10      |3600   |3240.0        |
|SCALA     |4100|15      |3100   |2635.0        |
|SCALA     |4500|15      |3500   |2975.0        |
|PHP       |3000|20      |2000   |1600.0        |
+----------+----+--------+-------+--------------+

Distinct Count of Departm



+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|20900               |
+--------------------+

+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+



# 15. PySpark SQL - Window functions

**WINDOW FUNCTIONS USAGE & SYNTAX -	PYSPARK WINDOW FUNCTIONS DESCRIPTION**
* row_number() -	Returns a sequential number starting from 1 within a window partition
* rank() -	Returns the rank of rows within a window partition, with gaps.
* percent_rank() -	Returns the percentile rank of rows within a window partition.
* dense_rank() -	Returns the rank of rows within a window partition without any gaps. Where as Rank() returns rank with gaps.
* ntile(n) -	Returns the ntile id in a window partition
* cume_dist() -	Returns the cumulative distribution of values within a window partition

* lag(e, offset) | lag(columnname, offset) | lag(columnname, offset, defaultvalue) -	
In PySpark, the lag() function retrieves the value of a column from a preceding row within the same window. It enables users to compare values across adjacent rows and perform calculations based on the difference or relationship between consecutive values in a DataFrame.

* lead(columnname, offset) | lead(columnname, offset, defaultvalue) - 	
The lead() function in PySpark retrieves the value of a column from a succeeding row within the same window. It enables users to access values ahead of the current row and perform comparisons or calculations based on future values in a DataFrame.

In [72]:
simpleData = (("James", "Sales", 3000), \
    ("Michael", "Sales", 4600),  \
    ("Robert", "Sales", 4100),   \
    ("Maria", "Finance", 3000),  \
    ("James", "Sales", 3000),    \
    ("Scott", "Finance", 3300),  \
    ("Jen", "Finance", 3900),    \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000),\
    ("Saif", "Sales", 4100) \
  )
 
columns= ["employee_name", "department", "salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

# Partitiob by logic common to all functions below
windowSpec = Window.partitionBy("department").orderBy("salary")

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



### Window Ranking functions

In [73]:
# row_number() example
df.withColumn("row_number", row_number().over(windowSpec)).show()

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         2|
|       Robert|     Sales|  4100|         3|
|         Saif|     Sales|  4100|         4|
|      Michael|     Sales|  4600|         5|
+-------------+----------+------+----------+



In [74]:
# rank() example
df.withColumn("rank",rank().over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
+-------------+----------+------+----+



In [75]:
# percent_rank() example
df.withColumn("percent_rank",percent_rank().over(windowSpec)).show()

+-------------+----------+------+------------+
|employee_name|department|salary|percent_rank|
+-------------+----------+------+------------+
|        Maria|   Finance|  3000|         0.0|
|        Scott|   Finance|  3300|         0.5|
|          Jen|   Finance|  3900|         1.0|
|        Kumar| Marketing|  2000|         0.0|
|         Jeff| Marketing|  3000|         1.0|
|        James|     Sales|  3000|         0.0|
|        James|     Sales|  3000|         0.0|
|       Robert|     Sales|  4100|         0.5|
|         Saif|     Sales|  4100|         0.5|
|      Michael|     Sales|  4600|         1.0|
+-------------+----------+------+------------+



In [76]:
# ntile() example
# ntile() window function returns the relative rank of result rows within a window partition. 
# In the below example we have used 2 as an argument to ntile hence it returns ranking between 2 values (1 and 2)
df.withColumn("ntile",ntile(2).over(windowSpec)).show()

df.withColumn("ntile",ntile(3).over(windowSpec)).show()

+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    1|
|          Jen|   Finance|  3900|    2|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    1|
|       Robert|     Sales|  4100|    1|
|         Saif|     Sales|  4100|    2|
|      Michael|     Sales|  4600|    2|
+-------------+----------+------+-----+

+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    2|
|          Jen|   Finance|  3900|    3|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    1|
|       Robert|     Sales|  4100|    2|

### Window Analytic functions

In [77]:
# cume_dist()
# This function computes the cumulative distribution of a value within a window partition. 
# It calculates the relative rank of a value within the partition.
# The result ranges from 0 to 1, where a value of 0 indicates the lowest value in the partition, and 1 indicates the highest.
# It’s useful for understanding the distribution of values compared to others within the same partition.

df.withColumn("cume_dist",cume_dist().over(windowSpec)).show()

+-------------+----------+------+------------------+
|employee_name|department|salary|         cume_dist|
+-------------+----------+------+------------------+
|        Maria|   Finance|  3000|0.3333333333333333|
|        Scott|   Finance|  3300|0.6666666666666666|
|          Jen|   Finance|  3900|               1.0|
|        Kumar| Marketing|  2000|               0.5|
|         Jeff| Marketing|  3000|               1.0|
|        James|     Sales|  3000|               0.4|
|        James|     Sales|  3000|               0.4|
|       Robert|     Sales|  4100|               0.8|
|         Saif|     Sales|  4100|               0.8|
|      Michael|     Sales|  4600|               1.0|
+-------------+----------+------+------------------+



In [78]:
# log() example
df.withColumn("lag",lag("salary",2).over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        Maria|   Finance|  3000|NULL|
|        Scott|   Finance|  3300|NULL|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|NULL|
|         Jeff| Marketing|  3000|NULL|
|        James|     Sales|  3000|NULL|
|        James|     Sales|  3000|NULL|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
+-------------+----------+------+----+



### Window Aggregate Functions

In [79]:
windowSpecAgg  = Window.partitionBy("department")
df.withColumn("row",row_number().over(windowSpec)) \
  .withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
  .withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
  .withColumn("min", min(col("salary")).over(windowSpecAgg)) \
  .withColumn("max", max(col("salary")).over(windowSpecAgg)) \
  .where(col("row")==1).select("department","avg","sum","min","max") \
  .show()

+----------+------+-----+----+----+
|department|   avg|  sum| min| max|
+----------+------+-----+----+----+
|   Finance|3400.0|10200|3000|3900|
| Marketing|2500.0| 5000|2000|3000|
|     Sales|3760.0|18800|3000|4600|
+----------+------+-----+----+----+



# 16. PySpark SQL - Date and Timestamp Functions

In [80]:
data=[["1","2020-02-01"],["2","2019-03-01"],["3","2021-03-01"]]
df=spark.createDataFrame(data,["id","input"])
df.show()

+---+----------+
| id|     input|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-03-01|
+---+----------+



In [87]:
# current_date()
df.select(current_date()).alias("curr_date").show(1)

# date_format()
df.select(date_format(col("input"), "MM-dd-yyyy").alias("date_format")).show()

# to_date()
df.select(to_date(col("input"),"yyyy-MM-dd").alias("to_date")).show()

+--------------+
|current_date()|
+--------------+
|    2024-10-12|
+--------------+
only showing top 1 row

+-----------+
|date_format|
+-----------+
| 02-01-2020|
| 03-01-2019|
| 03-01-2021|
+-----------+

+----------+
|   to_date|
+----------+
|2020-02-01|
|2019-03-01|
|2021-03-01|
+----------+



In [89]:
# datediff()
df.select(col("input"), datediff(current_date(),col("input")).alias("dafediff")).show()

#months_between()
df.select(col("input"), 
    months_between(current_date(),col("input")).alias("months_between")  
  ).show()

#trunc()
df.select(col("input"), 
    trunc(col("input"),"Month").alias("Month_Trunc"), 
    trunc(col("input"),"Year").alias("Month_Year"), 
    trunc(col("input"),"Month").alias("Month_Trunc")
   ).show()

+----------+--------+
|     input|dafediff|
+----------+--------+
|2020-02-01|    1715|
|2019-03-01|    2052|
|2021-03-01|    1321|
+----------+--------+

+----------+--------------+
|     input|months_between|
+----------+--------------+
|2020-02-01|   56.35483871|
|2019-03-01|   67.35483871|
|2021-03-01|   43.35483871|
+----------+--------------+

+----------+-----------+----------+-----------+
|     input|Month_Trunc|Month_Year|Month_Trunc|
+----------+-----------+----------+-----------+
|2020-02-01| 2020-02-01|2020-01-01| 2020-02-01|
|2019-03-01| 2019-03-01|2019-01-01| 2019-03-01|
|2021-03-01| 2021-03-01|2021-01-01| 2021-03-01|
+----------+-----------+----------+-----------+



In [92]:
#add_months() , date_add(), date_sub()
df.select(col("input"), 
    add_months(col("input"),3).alias("add_months"), 
    add_months(col("input"),-3).alias("sub_months"), 
    date_add(col("input"),4).alias("date_add"), 
    date_sub(col("input"),4).alias("date_sub") 
  ).show()

df.select(col("input"), 
     year(col("input")).alias("year"), 
     month(col("input")).alias("month"), 
     next_day(col("input"),"Sunday").alias("next_day"), 
     weekofyear(col("input")).alias("weekofyear") 
  ).show()

df.select(col("input"),  
     dayofweek(col("input")).alias("dayofweek"), 
     dayofmonth(col("input")).alias("dayofmonth"), 
     dayofyear(col("input")).alias("dayofyear"), 
  ).show()

+----------+----------+----------+----------+----------+
|     input|add_months|sub_months|  date_add|  date_sub|
+----------+----------+----------+----------+----------+
|2020-02-01|2020-05-01|2019-11-01|2020-02-05|2020-01-28|
|2019-03-01|2019-06-01|2018-12-01|2019-03-05|2019-02-25|
|2021-03-01|2021-06-01|2020-12-01|2021-03-05|2021-02-25|
+----------+----------+----------+----------+----------+

+----------+----+-----+----------+----------+
|     input|year|month|  next_day|weekofyear|
+----------+----+-----+----------+----------+
|2020-02-01|2020|    2|2020-02-02|         5|
|2019-03-01|2019|    3|2019-03-03|         9|
|2021-03-01|2021|    3|2021-03-07|         9|
+----------+----+-----+----------+----------+

+----------+---------+----------+---------+
|     input|dayofweek|dayofmonth|dayofyear|
+----------+---------+----------+---------+
|2020-02-01|        7|         1|       32|
|2019-03-01|        6|         1|       60|
|2021-03-01|        2|         1|       60|
+----------+-

In [93]:
data=[["1","02-01-2020 11 01 19 06"],["2","03-01-2019 12 01 19 406"],["3","03-01-2021 12 01 19 406"]]
df2=spark.createDataFrame(data,["id","input"])
df2.show(truncate=False)

+---+-----------------------+
|id |input                  |
+---+-----------------------+
|1  |02-01-2020 11 01 19 06 |
|2  |03-01-2019 12 01 19 406|
|3  |03-01-2021 12 01 19 406|
+---+-----------------------+



In [94]:
#current_timestamp()
df2.select(current_timestamp().alias("current_timestamp")
  ).show(1,truncate=False)

#to_timestamp()
df2.select(col("input"), 
    to_timestamp(col("input"), "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp") 
  ).show(truncate=False)

+-------------------------+
|current_timestamp        |
+-------------------------+
|2024-10-12 21:23:41.45752|
+-------------------------+
only showing top 1 row

+-----------------------+-----------------------+
|input                  |to_timestamp           |
+-----------------------+-----------------------+
|02-01-2020 11 01 19 06 |2020-02-01 11:01:19.06 |
|03-01-2019 12 01 19 406|2019-03-01 12:01:19.406|
|03-01-2021 12 01 19 406|2021-03-01 12:01:19.406|
+-----------------------+-----------------------+



In [95]:
#hour, minute,second
data=[["1","2020-02-01 11:01:19.06"],["2","2019-03-01 12:01:19.406"],["3","2021-03-01 12:01:19.406"]]
df3=spark.createDataFrame(data,["id","input"])

df3.select(col("input"), 
    hour(col("input")).alias("hour"), 
    minute(col("input")).alias("minute"),
    second(col("input")).alias("second") 
  ).show(truncate=False)

+-----------------------+----+------+------+
|input                  |hour|minute|second|
+-----------------------+----+------+------+
|2020-02-01 11:01:19.06 |11  |1     |19    |
|2019-03-01 12:01:19.406|12  |1     |19    |
|2021-03-01 12:01:19.406|12  |1     |19    |
+-----------------------+----+------+------+



# 17. PySpark - Read and Write CSV

In [102]:
# Read CSV File
df = spark.read.csv("zipcodes.csv")
df.printSchema()

# Using format().load()
df = spark.read.format("csv").load("zipcodes.csv")
#df.printSchema()

# Use header record for column names
df2 = spark.read.option("header",True) \
     .csv("zipcodes.csv")

# Read multiple CSV files
#df = spark.read.csv("path/file1.csv,path/file2.csv,path/file3.csv")

# Mulitple options at once
# Use header record for column names
df2 = spark.read.options(header='True',delimiter=',',inferSchema='True') \
    .csv("zipcodes.csv")

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)



### Custom Schema

In [107]:
# Using custom schema
schema = StructType() \
      .add("RecordNumber",IntegerType(),True) \
      .add("Zipcode",IntegerType(),True) \
      .add("ZipCodeType",StringType(),True) \
      .add("City",StringType(),True) \
      .add("State",StringType(),True) \
      .add("LocationType",StringType(),True) \
      .add("Lat",DoubleType(),True) \
      .add("Long",DoubleType(),True) \
      .add("Xaxis",IntegerType(),True) \
      .add("Yaxis",DoubleType(),True) \
      .add("Zaxis",DoubleType(),True) \
      .add("WorldRegion",StringType(),True) \
      .add("Country",StringType(),True) \
      .add("LocationText",StringType(),True) \
      .add("Location",StringType(),True) \
      .add("Decommisioned",BooleanType(),True) \
      .add("TaxReturnsFiled",StringType(),True) \
      .add("EstimatedPopulation",IntegerType(),True) \
      .add("TotalWages",IntegerType(),True) \
      .add("Notes",StringType(),True)
      
df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("zipcodes.csv")
df.show(truncate=False)

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+-----------------------+----------------------------+-------------+---------------+-------------------+----------+-------------+
|_c0         |_c1    |_c2        |_c3                |_c4  |_c5           |_c6  |_c7    |_c8  |_c9  |_c10 |_c11       |_c12   |_c13                   |_c14                        |_c15         |_c16           |_c17               |_c18      |_c19         |
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+-----------------------+----------------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|City               |State|LocationType  |Lat  |Long   |Xaxis|Yaxis|Zaxis|WorldRegion|Country|LocationText           |Location                    |Decommisioned|TaxReturnsFiled|EstimatedPopulation|To

### DataFrame Transformations

PySpark DataFrame transformations involve applying various operations to manipulate the data within a DataFrame. These transformations include:

**Filtering**: Selecting rows from the DataFrame based on specified conditions.  
**Selecting Columns**: Extracting specific columns from the DataFrame.  
**Adding Columns**: Creating new columns by performing computations or transformations on existing columns.  
**Dropping Columns**: Removing unnecessary columns from the DataFrame.  
**Grouping and Aggregating**: Grouping rows based on certain criteria and computing aggregate statistics, such as sum, average, count, etc., within each group.    
**Sorting**: Arranging the rows of the DataFrame in a specified order based on column values.  
**Joining**: Combining two DataFrames based on a common key or condition.  
**Union**: Concatenating two DataFrames vertically, adding rows from one DataFrame to another.  
**Pivoting and Melting**: Reshaping the DataFrame from long to wide format (pivoting) or from wide to long format (melting).   
**Window Functions**: Performing calculations over a sliding window of rows, such as computing moving averages or ranking.

### Write options:

In [124]:
df.write.mode('overwrite').option("header",True).csv("write-zipcodes-csv")

# You can also use this
#df2.write.format("csv").mode('overwrite').save("/tmp/spark_output/zipcodes")

**header** : Specifies whether to include a header row with column names in the CSV file. Example: option("header", "true").  
**delimiter** : Specifies the delimiter to use between fields in the CSV file. Example: option("delimiter", ",").  
**quote** : Specifies the character used for quoting fields in the CSV file. Example: option("quote", "\"").  
**escape** : Specifies the escape character used in the CSV file. Example: option("escape", "\\").  
**nullValue** : Specifies the string to represent null values in the CSV file. Example: option("nullValue", "NA").  
**dateFormat** : Specifies the date format to use for date columns. Example: option("dateFormat", "yyyy-MM-dd").  
**mode** : Specifies the write mode for the output. Options include “overwrite”, “append”, “ignore”, and “error”. Example: option("mode", "overwrite").  
**compression** : Specifies the compression codec to use for the output file. Example: option("compression", "gzip").  

# 18. PySpark - Read and Write Parquet

In [121]:
data =[("James ","","Smith","36636","M",3000),
              ("Michael ","Rose","","40288","M",4000),
              ("Robert ","","Williams","42114","M",4000),
              ("Maria ","Anne","Jones","39192","F",4000),
              ("Jen","Mary","Brown","","F",-1)]
columns=["firstname","middlename","lastname","dob","gender","salary"]
df=spark.createDataFrame(data,columns)
df.write.mode("overwrite").parquet("people.parquet")
parDF1=spark.read.parquet("people.parquet")
spark.sql("drop view ParquetTable")
parDF1.createOrReplaceTempView("parquetTable")
parDF1.printSchema()
parDF1.show(truncate=False)

parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ")
parkSQL.show(truncate=False)

spark.sql("CREATE OR REPLACE TEMPORARY VIEW PERSON USING parquet OPTIONS (path \"people.parquet\")")
spark.sql("SELECT * FROM PERSON").show()

df.write.partitionBy("gender","salary").mode("overwrite").parquet("people2.parquet")

parDF2=spark.read.parquet("people2.parquet/gender=M")
parDF2.show(truncate=False)

spark.sql("CREATE OR REPLACE  TEMPORARY VIEW PERSON2 USING parquet OPTIONS (path \"people2.parquet/gender=F\")")
spark.sql("SELECT * FROM PERSON2" ).show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|dob  |gender|salary|
+---------+----------+--------+-----+------+------+
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Michael  |Rose      |        |40288|M     |4000  |
|James    |          |Smith   |36636|M     |3000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|dob  |gender|salary|
+---------+----------+--------+-----+------+------+
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Michael  |Rose      |        |40

# 18. Read and Write JSON

In [125]:
# Read JSON file into dataframe    
df = spark.read.json("zipcodes.json")
df.printSchema()
df.show()

# Read multiline json file
multiline_df = spark.read.option("multiline","true") \
      .json("multiline-zipcode.json")
multiline_df.show()

#Read multiple files
# df2 = spark.read.json(
#     ['resources/zipcode2.json','resources/zipcode1.json'])
# df2.show()    

#Read All JSON files from a directory
df3 = spark.read.json("*.json")
df3.show()

# Define custom schema
schema = StructType([
      StructField("RecordNumber",IntegerType(),True),
      StructField("Zipcode",IntegerType(),True),
      StructField("ZipCodeType",StringType(),True),
      StructField("City",StringType(),True),
      StructField("State",StringType(),True),
      StructField("LocationType",StringType(),True),
      StructField("Lat",DoubleType(),True),
      StructField("Long",DoubleType(),True),
      StructField("Xaxis",IntegerType(),True),
      StructField("Yaxis",DoubleType(),True),
      StructField("Zaxis",DoubleType(),True),
      StructField("WorldRegion",StringType(),True),
      StructField("Country",StringType(),True),
      StructField("LocationText",StringType(),True),
      StructField("Location",StringType(),True),
      StructField("Decommisioned",BooleanType(),True),
      StructField("TaxReturnsFiled",StringType(),True),
      StructField("EstimatedPopulation",IntegerType(),True),
      StructField("TotalWages",IntegerType(),True),
      StructField("Notes",StringType(),True)
  ])

df_with_schema = spark.read.schema(schema) \
        .json("zipcodes.json")
df_with_schema.printSchema()
df_with_schema.show()

# Create a table from Parquet File
spark.sql("CREATE OR REPLACE TEMPORARY VIEW zipcode3 USING json OPTIONS" + 
      " (path 'zipcodes.json')")
spark.sql("select * from zipcode3").show()

# PySpark write Parquet File
df2.write.mode('Overwrite').json("write-zipcodes-json")

root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- EstimatedPopulation: long (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Long: double (nullable = true)
 |-- Notes: string (nullable = true)
 |-- RecordNumber: long (nullable = true)
 |-- State: string (nullable = true)
 |-- TaxReturnsFiled: long (nullable = true)
 |-- TotalWages: long (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Xaxis: double (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- Zipcode: long (nullable = true)

+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+-------+-------------+------------+-----+---------------+-------

# 19. PySpark - Built in Functions

### when() or otherwise() function

In [136]:
data = [("James","M",60000),("Michael","M",70000),
        ("Robert",None,400000),("Maria","F",500000),
        ("Jen","",None)]

columns = ["name","gender","salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  NULL|400000|
|  Maria|     F|500000|
|    Jen|      |  NULL|
+-------+------+------+



In [131]:
# when() or otherwise() function

df2 = df.withColumn("new_gender", when(df.gender == "M", "Male") \
             .when(df.gender == "F","Female") \
             .when(df.gender.isNull(),"") \
             .otherwise(df.gender))
df2.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  NULL|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  NULL|          |
+-------+------+------+----------+



### expr() function

In [132]:
#Using Case When on withColumn()
df3 = df.withColumn("new_gender", expr("CASE WHEN gender = 'M' THEN 'Male' " + 
               "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
               "ELSE gender END"))
df3.show(truncate=False)

+-------+------+------+----------+
|name   |gender|salary|new_gender|
+-------+------+------+----------+
|James  |M     |60000 |Male      |
|Michael|M     |70000 |Male      |
|Robert |NULL  |400000|          |
|Maria  |F     |500000|Female    |
|Jen    |      |NULL  |          |
+-------+------+------+----------+



In [138]:
#Using Case When on select()
df4 = df.select(col("*"), expr("CASE WHEN gender = 'M' THEN 'Male' " +
           "WHEN gender = 'F' THEN 'Female' WHEN gender IS NULL THEN ''" +
           "ELSE gender END").alias("new_gender"))
df4.show()

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  NULL|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  NULL|          |
+-------+------+------+----------+



In [139]:
data=[("2019-01-23",1),("2019-06-24",2),("2019-09-20",3)] 
df=spark.createDataFrame(data).toDF("date","increment") 

#Add Month value from another column
df.select(df.date,df.increment,
     expr("add_months(date,increment)")
  .alias("inc_date")).show()

+----------+---------+----------+
|      date|increment|  inc_date|
+----------+---------+----------+
|2019-01-23|        1|2019-02-23|
|2019-06-24|        2|2019-08-24|
|2019-09-20|        3|2019-12-20|
+----------+---------+----------+



### split() function

**Syntax:**  
pyspark.sql.functions.split(str, pattern, limit=-1)

In [140]:
# Data
data = [("James, A, Smith","2018","M",3000),
            ("Michael, Rose, Jones","2010","M",4000),
            ("Robert,K,Williams","2010","M",4000),
            ("Maria,Anne,Jones","2005","F",4000),
            ("Jen,Mary,Brown","2010","",-1)
            ]

columns=["name","dob_year","gender","salary"]

# Create DataFrame
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- dob_year: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+--------------------+--------+------+------+
|                name|dob_year|gender|salary|
+--------------------+--------+------+------+
|     James, A, Smith|    2018|     M|  3000|
|Michael, Rose, Jones|    2010|     M|  4000|
|   Robert,K,Williams|    2010|     M|  4000|
|    Maria,Anne,Jones|    2005|     F|  4000|
|      Jen,Mary,Brown|    2010|      |    -1|
+--------------------+--------+------+------+



In [142]:
df2 = df.select(split(col("name"),",")).alias("NameArray")
df2.printSchema()
df2.show(truncate=False)

root
 |-- split(name, ,, -1): array (nullable = true)
 |    |-- element: string (containsNull = false)

+------------------------+
|split(name, ,, -1)      |
+------------------------+
|[James,  A,  Smith]     |
|[Michael,  Rose,  Jones]|
|[Robert, K, Williams]   |
|[Maria, Anne, Jones]    |
|[Jen, Mary, Brown]      |
+------------------------+



In [146]:
df = df.withColumn("name_array",split(col("name"),",\s*"))
df.show(truncate=False)

+--------------------+--------+------+------+----------------------+
|name                |dob_year|gender|salary|name_array            |
+--------------------+--------+------+------+----------------------+
|James, A, Smith     |2018    |M     |3000  |[James, A, Smith]     |
|Michael, Rose, Jones|2010    |M     |4000  |[Michael, Rose, Jones]|
|Robert,K,Williams   |2010    |M     |4000  |[Robert, K, Williams] |
|Maria,Anne,Jones    |2005    |F     |4000  |[Maria, Anne, Jones]  |
|Jen,Mary,Brown      |2010    |      |-1    |[Jen, Mary, Brown]    |
+--------------------+--------+------+------+----------------------+



In [148]:
# Run SQL query
df.createOrReplaceTempView("PERSON")
spark.sql("select SPLIT(name,',') as NameArray from PERSON") \
    .show(truncate=False)

+------------------------+
|NameArray               |
+------------------------+
|[James,  A,  Smith]     |
|[Michael,  Rose,  Jones]|
|[Robert, K, Williams]   |
|[Maria, Anne, Jones]    |
|[Jen, Mary, Brown]      |
+------------------------+



### substring() function

**Syntax:**  
substring(str, pos, len)

In [153]:
# Create Sample Data
data = [(1,"20200828"),(2,"20180525")]
columns=["id","date"]
df=spark.createDataFrame(data,columns)
df.printSchema()
df.show(truncate=False)

# Using substring()
df2 = df.withColumn('year', substring('date', 1,4)) \
    .withColumn('month', substring('date', 5,2)) \
    .withColumn('day', substring('date', 7,2))
df2.printSchema()
df2.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)

+---+--------+
|id |date    |
+---+--------+
|1  |20200828|
|2  |20180525|
+---+--------+

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)

+---+--------+----+-----+---+
|id |date    |year|month|day|
+---+--------+----+-----+---+
|1  |20200828|2020|08   |28 |
|2  |20180525|2018|05   |25 |
+---+--------+----+-----+---+



### explode(), array() and array_contains() function

In [154]:
data = [
 ("James,,Smith",["Java","Scala","C++"],["Spark","Java"],"OH","CA"),
 ("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"],"NY","NJ"),
 ("Robert,,Williams",["CSharp","VB"],["Spark","Python"],"UT","NV")
]

schema = StructType([ 
    StructField("name",StringType(),True), 
    StructField("languagesAtSchool",ArrayType(StringType()),True), 
    StructField("languagesAtWork",ArrayType(StringType()),True), 
    StructField("currentState", StringType(), True), 
    StructField("previousState", StringType(), True)
  ])

df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



In [156]:
# explode() function:
#Use explode() function to create a new row for each element in the given array column.
#There are various PySpark SQL explode functions available to work with Array columns.

df.select(df.name,explode(df.languagesAtSchool)).show()

+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



In [157]:
# array() function:
#Use array() function to create a new array column by merging the data from multiple columns. 
#All input columns must have the same data type.

df.select(df.name,array(df.currentState,df.previousState).alias("states")).show()

+----------------+--------+
|            name|  states|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



In [160]:
# array_contains() function:
df.select(df.name,array_contains(df.languagesAtSchool, "Java").alias("array_contains")).show()

# another way of writing it
df2 = df.withColumn("array_contains_java",array_contains(df.languagesAtSchool,"Java")).show()

+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+

+----------------+------------------+---------------+------------+-------------+-------------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|array_contains_java|
+----------------+------------------+---------------+------------+-------------+-------------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|               true|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|               true|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|              false|
+----------------+------------------+---------------+------------+-------------+-------------------+



### Json Functions:

**JSON Functions -	Description**  
from_json() - 	Converts JSON string into Struct type or Map type.  
to_json() - 	Converts MapType or Struct type to JSON string.  
json_tuple() - 	Extract the Data from JSON and create them as a new columns.  
get_json_object() - 	Extracts JSON element from a JSON string based on json path specified.  
schema_of_json() - 	Create schema string from JSON string.  

In [161]:
jsonString="""{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}"""
df = spark.createDataFrame([(1, jsonString)],["id","value"])
df.show(truncate=False)

+---+--------------------------------------------------------------------------+
|id |value                                                                     |
+---+--------------------------------------------------------------------------+
|1  |{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}|
+---+--------------------------------------------------------------------------+



In [162]:
# from_json() function:
# PySpark from_json() function is used to convert JSON string into Struct type or Map type

df2 = df.withColumn("value",from_json(df.value,MapType(StringType(),StringType())))
df2.printSchema()
df2.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- value: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+---+---------------------------------------------------------------------------+
|id |value                                                                      |
+---+---------------------------------------------------------------------------+
|1  |{Zipcode -> 704, ZipCodeType -> STANDARD, City -> PARC PARQUE, State -> PR}|
+---+---------------------------------------------------------------------------+



In [163]:
df2.withColumn("value",to_json(col("value"))).show(truncate=False)

+---+----------------------------------------------------------------------------+
|id |value                                                                       |
+---+----------------------------------------------------------------------------+
|1  |{"Zipcode":"704","ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}|
+---+----------------------------------------------------------------------------+



In [164]:
df.select(col("id"),json_tuple(col("value"),"Zipcode","ZipCodeType","City")) \
    .toDF("id","Zipcode","ZipCodeType","City") \
    .show(truncate=False)

+---+-------+-----------+-----------+
|id |Zipcode|ZipCodeType|City       |
+---+-------+-----------+-----------+
|1  |704    |STANDARD   |PARC PARQUE|
+---+-------+-----------+-----------+



In [165]:
df.select(col("id"),get_json_object(col("value"),"$.ZipCodeType").alias("ZipCodeType")) \
    .show(truncate=False)

+---+-----------+
|id |ZipCodeType|
+---+-----------+
|1  |STANDARD   |
+---+-----------+



In [166]:
schemaStr=spark.range(1) \
    .select(schema_of_json(lit("""{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}"""))) \
    .collect()[0][0]
print(schemaStr)

STRUCT<City: STRING, State: STRING, ZipCodeType: STRING, Zipcode: BIGINT>
