In [10]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import *

# Create the SparkSession for this notebook, or reuse one that already exists.
# "local[*]" runs Spark on this machine instead of connecting to a cluster.
spark = (
    SparkSession.builder
    .appName("YourSparkApplication")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Sample person records, one tuple per row. Field order:
# (firstname, middlename, lastname, dob, gender, salary).
# A missing name part is stored as an empty string, not None.
# NOTE(review): the salary of -1 on the last row looks like a placeholder — confirm.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

In [3]:
# Column names for the DataFrame, listed in the same order as the fields
# of each tuple in `data`.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

In [4]:
# Build the base DataFrame from the in-memory rows. Only column names are
# supplied as the schema, so Spark infers each column's type from the values.
df = spark.createDataFrame(data=data, schema=columns)

In [5]:
# Print the DataFrame's rows as a text table to check the loaded data.
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



## new column creation 

In [13]:
# Add a `fullname` column by joining the three name parts with no separator.
# Empty-string parts contribute nothing; note that concat() would return NULL
# for a row if any of its inputs were NULL.
df2 = df.withColumn(
    "fullname",
    concat(df["firstname"], df["middlename"], df["lastname"]),
)

In [16]:
# Show df2 with an extra `string_len` column holding the character length of
# `fullname`. The result is only displayed; df2 itself is left unchanged.
df2.withColumn("string_len", length(df2["fullname"])).show()

+---------+----------+--------+----------+------+------+--------------+----------+
|firstname|middlename|lastname|       dob|gender|salary|      fullname|string_len|
+---------+----------+--------+----------+------+------+--------------+----------+
|    James|          |   Smith|1991-04-01|     M|  3000|    JamesSmith|        10|
|  Michael|      Rose|        |2000-05-19|     M|  4000|   MichaelRose|        11|
|   Robert|          |Williams|1978-09-05|     M|  4000|RobertWilliams|        14|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|MariaAnneJones|        14|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  JenMaryBrown|        12|
+---------+----------+--------+----------+------+------+--------------+----------+



## manipulating existing columns

In [17]:
# Reusing an existing column name in withColumn replaces that column: here
# `middlename` is overwritten with its whitespace-trimmed value. The result is
# only displayed; df2 itself is left unchanged.
df2.withColumn("middlename", trim(df2["middlename"])).show()

+---------+----------+--------+----------+------+------+--------------+
|firstname|middlename|lastname|       dob|gender|salary|      fullname|
+---------+----------+--------+----------+------+------+--------------+
|    James|          |   Smith|1991-04-01|     M|  3000|    JamesSmith|
|  Michael|      Rose|        |2000-05-19|     M|  4000|   MichaelRose|
|   Robert|          |Williams|1978-09-05|     M|  4000|RobertWilliams|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|MariaAnneJones|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  JenMaryBrown|
+---------+----------+--------+----------+------+------+--------------+



# Observation: the first parameter of `withColumn` (the column name) must be a string

In [35]:
# Replace `salary` with the same values cast to Spark's integer type.
# Re-running this cell is harmless: casting an integer column to integer
# leaves it unchanged.
df2 = df2.withColumn("salary", df2["salary"].cast("integer"))

In [36]:
# Print each column's name, type and nullability to check the salary cast.
df2.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- fullname: string (nullable = true)



In [37]:
# Rename `dob` to the more descriptive `date_of_birth`. withColumnRenamed is a
# no-op when the source column is absent, so re-running this cell is harmless.
df2 = df2.withColumnRenamed(existing="dob", new="date_of_birth")

# Display the frame to confirm the new column name.
df2.show()

+---------+----------+--------+-------------+------+------+--------------+
|firstname|middlename|lastname|date_of_birth|gender|salary|      fullname|
+---------+----------+--------+-------------+------+------+--------------+
|    James|          |   Smith|   1991-04-01|     M|  3000|    JamesSmith|
|  Michael|      Rose|        |   2000-05-19|     M|  4000|   MichaelRose|
|   Robert|          |Williams|   1978-09-05|     M|  4000|RobertWilliams|
|    Maria|      Anne|   Jones|   1967-12-01|     F|  4000|MariaAnneJones|
|      Jen|      Mary|   Brown|   1980-02-17|     F|    -1|  JenMaryBrown|
+---------+----------+--------+-------------+------+------+--------------+

