<a href="https://colab.research.google.com/github/sumaaithal/PySpark_30Days_Challenge/blob/main/pyspark_learn3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
# Point the process at the Java install and the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [3]:
import findspark

# Run findspark before importing pyspark (presumably it makes the Spark
# install under SPARK_HOME importable -- confirm against the findspark docs).
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Eager evaluation: used here to get better-formatted output tables.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Last expression of the cell: display the session object.
spark

In [4]:
# Sample rows: ((firstname, middlename, lastname), dob, gender, salary).
# Note: despite its name, this is a plain Python list, not a DataFrame.
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1),
]

In [19]:
from pyspark.sql.types import StringType,IntegerType,StructField,StructType
from pyspark.sql.functions import col

In [7]:
# Schema for dataDF: a nested `name` struct followed by three flat columns.
# Every field is nullable.
schema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('dob', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

In [8]:
# Build the DataFrame from the row list using the explicit schema.
df = spark.createDataFrame(data=dataDF, schema=schema)

In [9]:
# Print the rows as a text table; the nested `name` struct renders as {first, middle, last}.
df.show()

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



In [10]:
# Print the column tree, including the fields nested under `name`.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [11]:
## withColumnRenamed: rename columns (each call returns a new DataFrame)

In [14]:
# Rename `dob` to `dateOfBirth`; `df` is rebound to the renamed DataFrame.
df = df.withColumnRenamed(existing='dob', new='dateOfBirth')

In [15]:
# Check the rename: the schema should now list `dateOfBirth` instead of `dob`.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [16]:
# Chain two renames. The result is bound to df2, so `df` keeps its own column names.
df2 = (
    df
    .withColumnRenamed('gender', 'Gender')
    .withColumnRenamed('salary', 'Salary_Amt')
)

In [17]:
# Check both renames: expect `Gender` and `Salary_Amt` in the schema.
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dateOfBirth: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary_Amt: integer (nullable = true)



In [18]:
# Target struct type used to rename the fields nested under `name`:
# three nullable string fields with the shorter names.
schema2 = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("fname", "mname", "lname")
])

In [22]:
# Casting the `name` struct to schema2 renames its nested fields; the other
# columns are passed through unchanged. Print the resulting schema.
(
    df2
    .select(col("name").cast(schema2), "dateOfBirth", "Gender", "Salary_Amt")
    .printSchema()
)

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- mname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- dateOfBirth: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary_Amt: integer (nullable = true)



In [23]:
# Same projection as the schema check: cast `name` to schema2 and keep the
# remaining columns, this time displaying the rows.
(
    df2
    .select(col("name").cast(schema2), "dateOfBirth", "Gender", "Salary_Amt")
    .show()
)

+--------------------+-----------+------+----------+
|                name|dateOfBirth|Gender|Salary_Amt|
+--------------------+-----------+------+----------+
|    {James, , Smith}| 1991-04-01|     M|      3000|
|   {Michael, Rose, }| 2000-05-19|     M|      4000|
|{Robert, , Williams}| 1978-09-05|     M|      4000|
|{Maria, Anne, Jones}| 1967-12-01|     F|      4000|
|  {Jen, Mary, Brown}| 1980-02-17|     F|        -1|
+--------------------+-----------+------+----------+



In [24]:
# `df` still has lowercase `gender`/`salary`: those renames were bound to df2, not df.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [25]:
# Bind a second name to the same DataFrame object (plain assignment, no copy is made).
df3 = df

In [40]:
# Flatten the nested `name` struct into top-level fname/mname/lname columns
# and keep the remaining columns as they are.
#
# The projection reads from `df` rather than from `df3` itself so that this
# cell can be re-run: once df3 has been flattened it no longer has a `name`
# column, and selecting "name.firstname" from it would fail.
df3 = df.select(
    col("name.firstname").alias('fname'),
    col("name.middlename").alias('mname'),
    col("name.lastname").alias('lname'),
    col("dateOfBirth"),
    col("gender"),
    col("salary"),
)

In [43]:
# Bind another name to the same DataFrame object (plain assignment, no copy is made).
df4 = df

In [44]:
# Alternative way to flatten `name`: add one top-level column per nested
# field, then drop the original struct. The new columns are appended after
# the existing ones.
#
# The chain starts from `df` rather than from `df4` itself so that this cell
# can be re-run: after the first run df4 has no `name` column, and
# col("name.firstname") could no longer be resolved against it.
df4 = (
    df
    .withColumn("fname", col("name.firstname"))
    .withColumn("mname", col("name.middlename"))
    .withColumn("lname", col("name.lastname"))
    .drop("name")
)

In [45]:
# Display the flattened rows; fname/mname/lname appear after the original columns.
df4.show()

+-----------+------+------+-------+-----+--------+
|dateOfBirth|gender|salary|  fname|mname|   lname|
+-----------+------+------+-------+-----+--------+
| 1991-04-01|     M|  3000|  James|     |   Smith|
| 2000-05-19|     M|  4000|Michael| Rose|        |
| 1978-09-05|     M|  4000| Robert|     |Williams|
| 1967-12-01|     F|  4000|  Maria| Anne|   Jones|
| 1980-02-17|     F|    -1|    Jen| Mary|   Brown|
+-----------+------+------+-------+-----+--------+



In [48]:
# Bind df5 to the same flattened DataFrame as df4 (plain assignment, no copy is made).
df5 = df4

In [46]:
# Replacement column names, listed in the same order as the columns of the
# DataFrame they are applied to with toDF(*cols).
cols = [
    "DOB", "Gender", "Salary",
    "Fname", "Mname", "Lname",
]

In [49]:
# Rename every column at once: toDF applies the names in `cols` by position.
# The renamed DataFrame is only displayed here; df5 itself is not reassigned.
df5.toDF(*cols).show()

+----------+------+------+-------+-----+--------+
|       DOB|Gender|Salary|  Fname|Mname|   Lname|
+----------+------+------+-------+-----+--------+
|1991-04-01|     M|  3000|  James|     |   Smith|
|2000-05-19|     M|  4000|Michael| Rose|        |
|1978-09-05|     M|  4000| Robert|     |Williams|
|1967-12-01|     F|  4000|  Maria| Anne|   Jones|
|1980-02-17|     F|    -1|    Jen| Mary|   Brown|
+----------+------+------+-------+-----+--------+

