In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Start (or reuse) the Spark session that every later cell runs against.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [16]:
# Sample records: ((first, middle, last), date of birth, gender, salary).
# An empty string stands in for a missing name part.
data = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000.0),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 3500.0),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 7300.0),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 5004.0),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', 10000.0),
]

In [17]:
# Schema for `data`: a nested "name" struct followed by three flat columns.
# Every field is nullable.
schema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('dob', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', DoubleType(), True)
)

In [18]:
# Build the base DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)

In [19]:
# Print the column tree to confirm the nested "name" struct was applied.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



In [20]:
# truncate=False prints full cell contents instead of cutting long values short.
df.show(truncate=False)

+--------------------+----------+------+-------+
|name                |dob       |gender|salary |
+--------------------+----------+------+-------+
|[James, , Smith]    |1991-04-01|M     |3000.0 |
|[Michael, Rose, ]   |2000-05-19|M     |3500.0 |
|[Robert, , Williams]|1978-09-05|M     |7300.0 |
|[Maria, Anne, Jones]|1967-12-01|F     |5004.0 |
|[Jen, Mary, Brown]  |1980-02-17|F     |10000.0|
+--------------------+----------+------+-------+



In [21]:
# Dot notation reaches into the struct; the result column is named "firstname".
df.select(F.col("name.firstname")).show()

+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
|      Jen|
+---------+



In [22]:
# Surface the nested name.firstname field as a top-level "first_name" column.
# withColumn already returns every existing column plus the new one, so the
# trailing select("*") was a no-op projection and has been removed.
df.withColumn("first_name", F.col("name.firstname")).show()

+--------------------+----------+------+-------+----------+
|                name|       dob|gender| salary|first_name|
+--------------------+----------+------+-------+----------+
|    [James, , Smith]|1991-04-01|     M| 3000.0|     James|
|   [Michael, Rose, ]|2000-05-19|     M| 3500.0|   Michael|
|[Robert, , Williams]|1978-09-05|     M| 7300.0|    Robert|
|[Maria, Anne, Jones]|1967-12-01|     F| 5004.0|     Maria|
|  [Jen, Mary, Brown]|1980-02-17|     F|10000.0|       Jen|
+--------------------+----------+------+-------+----------+



In [29]:
# Flatten the nested name struct into top-level columns, tag each row with a
# generated id, and leave the original struct out of the projection.
# Column order matches the displayed result: dob, gender, salary, names, uid.
df2 = df.select(
    "dob",
    "gender",
    "salary",
    F.col("name.firstname").alias("first_name"),
    F.col("name.middlename").alias("mid_name"),
    F.col("name.lastname").alias("last_name"),
    F.monotonically_increasing_id().alias("uid"),
)

In [30]:
# Display the flattened frame, including the generated uid column.
df2.show()

+----------+------+-------+----------+--------+---------+-----------+
|       dob|gender| salary|first_name|mid_name|last_name|        uid|
+----------+------+-------+----------+--------+---------+-----------+
|1991-04-01|     M| 3000.0|     James|        |    Smith|          0|
|2000-05-19|     M| 3500.0|   Michael|    Rose|         | 8589934592|
|1978-09-05|     M| 7300.0|    Robert|        | Williams|17179869184|
|1967-12-01|     F| 5004.0|     Maria|    Anne|    Jones|25769803776|
|1980-02-17|     F|10000.0|       Jen|    Mary|    Brown|25769803777|
+----------+------+-------+----------+--------+---------+-----------+



In [25]:
# Rename several nested fields at once: describe the struct with the new
# field names, then cast the existing struct column to it.
schema2 = (
    StructType()
    .add("f_name", StringType())
    .add("m_name", StringType())
    .add("l_name", StringType())
)

In [26]:
# Casting the struct to schema2 renames its nested fields;
# the remaining columns pass through unchanged.
df.select(
    F.col("name").cast(schema2),
    "dob",
    "gender",
    "salary",
).printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- f_name: string (nullable = true)
 |    |-- m_name: string (nullable = true)
 |    |-- l_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



In [28]:
# Alternative: pull each nested field out with alias() so the result is flat.
df.select(
    F.col("name.firstname").alias("fname"),
    F.col("name.middlename").alias("mname"),
    F.col("name.lastname").alias("lname"),
    "dob",
    "gender",
    "salary",
).printSchema()

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: double (nullable = true)



In [33]:
# Generate one replacement name per top-level column: newCol1, newCol2, ...
newColumns = [
    f"newCol{position}"
    for position, _ in enumerate(df.columns, start=1)
]
newColumns

In [35]:
# toDF(*names) renames every top-level column by position; the nested struct
# field names are left as they were (see the printed schema).
df.toDF(*newColumns).printSchema()

root
 |-- newCol1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- newCol2: string (nullable = true)
 |-- newCol3: string (nullable = true)
 |-- newCol4: double (nullable = true)



In [39]:
# Drop to the RDD API and build "<firstname> <lastname>" from the name struct,
# which is the first field of every row. The map is lazy; nothing runs here.
x = df.rdd.map(lambda row: ' '.join((row[0].firstname, row[0].lastname)))
type(x)

pyspark.rdd.PipelinedRDD

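Note: `toDF()` is attached to `RDD` once a `SparkSession` has been created, and `PipelinedRDD` is a subclass of `RDD`, so it has `toDF()` as well. Here, however, `x` holds bare strings, so Spark cannot infer a schema; wrap each element in a tuple or `Row` first (e.g. `x.map(lambda s: (s,)).toDF(["full_name"])`) to convert it to a DataFrame.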