In [1]:
import sys
import os

In [2]:
# Show the JDK location visible to this kernel (None when JAVA_HOME is unset).
os.getenv('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark
# Runs before the first `pyspark` import in this notebook.
# NOTE(review): presumably locates the local Spark installation and makes
# `pyspark` importable — confirm against the findspark documentation.
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType, MapType

In [5]:
# Create (or reuse) the SparkSession for this notebook, with the SQL warehouse
# directory set to the relative path "temp".
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "temp")
    .appName("SparkSQL")
    .getOrCreate()
)

In [6]:
# Sample rows: ((firstname, middlename, lastname), id, gender, salary).
# Missing name parts and the missing id are empty strings; the last row's
# salary is -1.
data = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

In [7]:
# Explicit schema for the sample rows: a nested `name` struct followed by
# three scalar columns. Every field is nullable.
schema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('id', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

In [8]:
# Build the DataFrame from the rows and the explicit schema, then print its
# schema and its untruncated contents.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



In [9]:
# Pull two fields out of the nested `name` struct using dotted paths.
df.select(
    "name.firstname",
    "name.lastname",
).show(truncate=False)

+---------+--------+
|firstname|lastname|
+---------+--------+
|James    |Smith   |
|Michael  |        |
|Robert   |Williams|
|Maria    |Jones   |
|Jen      |Brown   |
+---------+--------+



In [10]:
import json

# Serialize the DataFrame schema to JSON text, then re-parse it so it can be
# pretty-printed with a two-space indent.
json_data = df.schema.json()
print(
    json.dumps(
        json.loads(json_data),
        indent=2,
    )
)

{
  "fields": [
    {
      "metadata": {},
      "name": "name",
      "nullable": true,
      "type": {
        "fields": [
          {
            "metadata": {},
            "name": "firstname",
            "nullable": true,
            "type": "string"
          },
          {
            "metadata": {},
            "name": "middlename",
            "nullable": true,
            "type": "string"
          },
          {
            "metadata": {},
            "name": "lastname",
            "nullable": true,
            "type": "string"
          }
        ],
        "type": "struct"
      }
    },
    {
      "metadata": {},
      "name": "id",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "gender",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "salary",
      "nullable": true,
      "type": "integer"
    }
  ],
  "type": "struct"
}


In [11]:
# Sample rows: ((firstname, middlename, lastname), hobbies list, properties dict).
# Each properties dict holds a single key, and the keys differ between rows.
data = [
    (("James", "", "Smith"), ["gym", "music"], {"a": "wf3eg"}),
    (("Michael", "Rose", ""), ["art"], {"b": "ef4g"}),
    (("Robert", "", "Williams"), ["gym"], {"b": "dw3"}),
    (("Maria", "Anne", "Jones"), ["dance"], {"d": "e32r"}),
    (("Jen", "Mary", "Brown"), ["music"], {"a": "swf3"}),
]

In [12]:
# Schema with one column of each complex type: a struct (`name`), an array of
# strings (`hobbies`) and a string-to-string map (`properties`).
schema = StructType([
    StructField(
        'name',
        StructType([
            StructField('firstname', StringType(), nullable=True),
            StructField('middlename', StringType(), nullable=True),
            StructField('lastname', StringType(), nullable=True),
        ]),
    ),
    StructField('hobbies', ArrayType(StringType()), nullable=True),
    StructField('properties', MapType(StringType(), StringType()), nullable=True),
])

In [13]:
# Rebuild `df` from the struct/array/map rows and schema, then print its
# schema and its untruncated contents.
df = spark.createDataFrame(
    data=data,
    schema=schema,
)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+--------------------+------------+------------+
|name                |hobbies     |properties  |
+--------------------+------------+------------+
|{James, , Smith}    |[gym, music]|{a -> wf3eg}|
|{Michael, Rose, }   |[art]       |{b -> ef4g} |
|{Robert, , Williams}|[gym]       |{b -> dw3}  |
|{Maria, Anne, Jones}|[dance]     |{d -> e32r} |
|{Jen, Mary, Brown}  |[music]     |{a -> swf3} |
+--------------------+------------+------------+



In [14]:
from pyspark.sql.functions import lit

In [15]:
# Show `name` alongside a constant-valued `country` column.
df.select("name", lit("USA").alias("country")).show()

+--------------------+-------+
|                name|country|
+--------------------+-------+
|    {James, , Smith}|    USA|
|   {Michael, Rose, }|    USA|
|{Robert, , Williams}|    USA|
|{Maria, Anne, Jones}|    USA|
|  {Jen, Mary, Brown}|    USA|
+--------------------+-------+



In [16]:
# Dotted paths also address map keys; rows whose map lacks the key show null.
df.select(
    "hobbies",
    "properties.a",
    "properties.b",
).show(truncate=False)

+------------+-----+----+
|hobbies     |a    |b   |
+------------+-----+----+
|[gym, music]|wf3eg|null|
|[art]       |null |ef4g|
|[gym]       |null |dw3 |
|[dance]     |null |null|
|[music]     |swf3 |null|
+------------+-----+----+



In [17]:
# `properties` is a MapType column, so look the key up with getItem(), the
# documented accessor for map keys (getField() is documented for struct
# fields). The resulting column is still named `properties[a]`, and rows
# whose map has no "a" key yield null.
df.select(df.properties.getItem("a")).show()

+-------------+
|properties[a]|
+-------------+
|        wf3eg|
|         null|
|         null|
|         null|
|         swf3|
+-------------+



In [18]:
# Extract one field of the `name` struct and expose it under a new column name.
df.select(
    df["name"]["firstname"].alias("f_name")
).show()

+-------+
| f_name|
+-------+
|  James|
|Michael|
| Robert|
|  Maria|
|    Jen|
+-------+



In [19]:
# "name.*" expands every field of the struct into its own top-level column.
df.select(
    "name.*"
).show(truncate=False)

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
|James    |          |Smith   |
|Michael  |Rose      |        |
|Robert   |          |Williams|
|Maria    |Anne      |Jones   |
|Jen      |Mary      |Brown   |
+---------+----------+--------+



In [20]:
from pyspark.sql.functions import col

In [21]:
# Flatten the `name` struct: copy each nested field to a top-level column,
# then drop the original struct column.
dfNew = (
    df
    .withColumn("firstname", col("name.firstname"))
    .withColumn("middlename", col("name.middlename"))
    .withColumn("lastname", col("name.lastname"))
    .drop("name")
)
dfNew.printSchema()
dfNew.show()

root
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)

+------------+------------+---------+----------+--------+
|     hobbies|  properties|firstname|middlename|lastname|
+------------+------------+---------+----------+--------+
|[gym, music]|{a -> wf3eg}|    James|          |   Smith|
|       [art]| {b -> ef4g}|  Michael|      Rose|        |
|       [gym]|  {b -> dw3}|   Robert|          |Williams|
|     [dance]| {d -> e32r}|    Maria|      Anne|   Jones|
|     [music]| {a -> swf3}|      Jen|      Mary|   Brown|
+------------+------------+---------+----------+--------+



In [22]:
# Two equivalent spellings of "firstname is not James":
# 1) negate an equality test on the DataFrame's own column,
dfNew.filter(~(dfNew["firstname"] == "James")).show(truncate=False)
# 2) use the inequality operator on a column looked up by name.
dfNew.filter(col("firstname") != "James").show(truncate=False)

+-------+-----------+---------+----------+--------+
|hobbies|properties |firstname|middlename|lastname|
+-------+-----------+---------+----------+--------+
|[art]  |{b -> ef4g}|Michael  |Rose      |        |
|[gym]  |{b -> dw3} |Robert   |          |Williams|
|[dance]|{d -> e32r}|Maria    |Anne      |Jones   |
|[music]|{a -> swf3}|Jen      |Mary      |Brown   |
+-------+-----------+---------+----------+--------+

+-------+-----------+---------+----------+--------+
|hobbies|properties |firstname|middlename|lastname|
+-------+-----------+---------+----------+--------+
|[art]  |{b -> ef4g}|Michael  |Rose      |        |
|[gym]  |{b -> dw3} |Robert   |          |Williams|
|[dance]|{d -> e32r}|Maria    |Anne      |Jones   |
|[music]|{a -> swf3}|Jen      |Mary      |Brown   |
+-------+-----------+---------+----------+--------+



In [23]:
# The same kind of predicate written as a SQL expression string.
dfNew.where("firstname != 'Michael'").show(truncate=False)

+------------+------------+---------+----------+--------+
|hobbies     |properties  |firstname|middlename|lastname|
+------------+------------+---------+----------+--------+
|[gym, music]|{a -> wf3eg}|James    |          |Smith   |
|[gym]       |{b -> dw3}  |Robert   |          |Williams|
|[dance]     |{d -> e32r} |Maria    |Anne      |Jones   |
|[music]     |{a -> swf3} |Jen      |Mary      |Brown   |
+------------+------------+---------+----------+--------+



In [24]:
# List the column names of the flattened DataFrame.
dfNew.columns

['hobbies', 'properties', 'firstname', 'middlename', 'lastname']

In [25]:
from pyspark.sql.functions import array_contains

# Keep only the rows whose `hobbies` array contains "gym".
# The column is taken from dfNew itself, the frame being filtered. Referencing
# the parent frame's `df.hobbies` only resolves while dfNew still shares
# lineage with that exact `df` object, so re-creating `df` without rebuilding
# dfNew would make the filter fail.
dfNew.filter(array_contains(dfNew.hobbies, "gym")).show(truncate=False)

+------------+------------+---------+----------+--------+
|hobbies     |properties  |firstname|middlename|lastname|
+------------+------------+---------+----------+--------+
|[gym, music]|{a -> wf3eg}|James    |          |Smith   |
|[gym]       |{b -> dw3}  |Robert   |          |Williams|
+------------+------------+---------+----------+--------+



In [26]:
# Sort by firstname descending, breaking ties by lastname ascending.
dfNew.orderBy(
    ["firstname", "lastname"],
    ascending=[False, True],
).show(truncate=False)

+------------+------------+---------+----------+--------+
|hobbies     |properties  |firstname|middlename|lastname|
+------------+------------+---------+----------+--------+
|[gym]       |{b -> dw3}  |Robert   |          |Williams|
|[art]       |{b -> ef4g} |Michael  |Rose      |        |
|[dance]     |{d -> e32r} |Maria    |Anne      |Jones   |
|[music]     |{a -> swf3} |Jen      |Mary      |Brown   |
|[gym, music]|{a -> wf3eg}|James    |          |Smith   |
+------------+------------+---------+----------+--------+



In [27]:
# Sort by firstname ascending, breaking ties by lastname descending.
dfNew.sort(
    col("firstname").asc(),
    col("lastname").desc(),
).show(truncate=False)

+------------+------------+---------+----------+--------+
|hobbies     |properties  |firstname|middlename|lastname|
+------------+------------+---------+----------+--------+
|[gym, music]|{a -> wf3eg}|James    |          |Smith   |
|[music]     |{a -> swf3} |Jen      |Mary      |Brown   |
|[dance]     |{d -> e32r} |Maria    |Anne      |Jones   |
|[art]       |{b -> ef4g} |Michael  |Rose      |        |
|[gym]       |{b -> dw3}  |Robert   |          |Williams|
+------------+------------+---------+----------+--------+



In [28]:
# Rename all top-level columns positionally with toDF(). The renamed frame is
# not assigned, so `df` keeps its original column names; only the schema of
# the renamed copy is printed.
print(f"OLD COLUMNS: {df.columns}\n")

new_cols = [
    'user_name',
    'hobbies_list',
    'product_dict',
]
df.toDF(*new_cols).printSchema()

OLD COLUMNS: ['name', 'hobbies', 'properties']

root
 |-- user_name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- hobbies_list: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- product_dict: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [29]:
import json

# Dump the schema of the struct/array/map DataFrame as indented JSON.
json_data = df.schema.json()
print(json.dumps(
    json.loads(json_data),
    indent=2,
))

{
  "fields": [
    {
      "metadata": {},
      "name": "name",
      "nullable": true,
      "type": {
        "fields": [
          {
            "metadata": {},
            "name": "firstname",
            "nullable": true,
            "type": "string"
          },
          {
            "metadata": {},
            "name": "middlename",
            "nullable": true,
            "type": "string"
          },
          {
            "metadata": {},
            "name": "lastname",
            "nullable": true,
            "type": "string"
          }
        ],
        "type": "struct"
      }
    },
    {
      "metadata": {},
      "name": "hobbies",
      "nullable": true,
      "type": {
        "containsNull": true,
        "elementType": "string",
        "type": "array"
      }
    },
    {
      "metadata": {},
      "name": "properties",
      "nullable": true,
      "type": {
        "keyType": "string",
        "type": "map",
        "valueContainsNull": true,
        "va