<a href="https://colab.research.google.com/github/t-abs/Apache-Spark/blob/main/Pyspark_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Transformations (1) in Spark: selecting columns, column expressions, aliases, and Spark SQL

In [1]:
from pyspark.sql import SparkSession


In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .getOrCreate()
)

# Load the employees CSV: the first row supplies the column names and
# Spark is asked to infer each column's type from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/employees.csv")
)

# Print the leading rows as a text table.
df.show()

# Print the column names and types of the loaded DataFrame.
df.printSchema()

+----------+------+----------+---------------+------+-------+-----------------+--------------------+
|First Name|Gender|Start Date|Last Login Time|Salary|Bonus %|Senior Management|                Team|
+----------+------+----------+---------------+------+-------+-----------------+--------------------+
|   Douglas|  Male|  8/6/1993|       12:42 PM| 97308|  6.945|             true|           Marketing|
|    Thomas|  Male| 3/31/1996|        6:53 AM| 61933|   4.17|             true|                NULL|
|     Maria|Female| 4/23/1993|       11:17 AM|130590| 11.858|            false|             Finance|
|     Jerry|  Male|  3/4/2005|        1:00 PM|138705|   9.34|             true|             Finance|
|     Larry|  Male| 1/24/1998|        4:47 PM|101004|  1.389|             true|     Client Services|
|    Dennis|  Male| 4/18/1987|        1:35 AM|115163| 10.125|            false|               Legal|
|      Ruby|Female| 8/17/1987|        4:20 PM| 65476| 10.012|             true|            

In [4]:
# Project a single column by passing its name as a plain string.
first_name_only = df.select("First Name")
first_name_only.show()

+----------+
|First Name|
+----------+
|   Douglas|
|    Thomas|
|     Maria|
|     Jerry|
|     Larry|
|    Dennis|
|      Ruby|
|      NULL|
|    Angela|
|   Frances|
|    Louise|
|     Julie|
|   Brandon|
|      Gary|
|  Kimberly|
|   Lillian|
|    Jeremy|
|     Shawn|
|     Diana|
|     Donna|
+----------+
only showing top 20 rows


In [5]:
# `col` and `expr`, used in the cells below, are brought into scope by these wildcard imports.
# NOTE(review): `import *` hides where names come from and may shadow Python built-ins;
# consider explicit imports (e.g. `from pyspark.sql.functions import col, expr`) once
# every name the notebook relies on has been confirmed.
from pyspark.sql.functions import *
from pyspark.sql.types import *


In [6]:
# Same projection as the string form, this time built from a Column object via col().
first_name_col = col("First Name")
df.select(first_name_col).show()

+----------+
|First Name|
+----------+
|   Douglas|
|    Thomas|
|     Maria|
|     Jerry|
|     Larry|
|    Dennis|
|      Ruby|
|      NULL|
|    Angela|
|   Frances|
|    Louise|
|     Julie|
|   Brandon|
|      Gary|
|  Kimberly|
|   Lillian|
|    Jeremy|
|     Shawn|
|     Diana|
|     Donna|
+----------+
only showing top 20 rows


Transformation: arithmetic on a column

In [8]:
# Arithmetic on a Column yields a new Column; Spark labels the result "(Salary + 5)".
salary_plus_five = col("Salary") + 5
df.select(salary_plus_five).show()

+------------+
|(Salary + 5)|
+------------+
|       97313|
|       61938|
|      130595|
|      138710|
|      101009|
|      115168|
|       65481|
|       45911|
|       95575|
|      139857|
|       63246|
|      102513|
|      112812|
|      109836|
|       41431|
|       59419|
|       90375|
|      111742|
|      132945|
|       81019|
+------------+
only showing top 20 rows


Selecting multiple columns

In [9]:
# select() accepts several column names at once; they come back in the order given.
name_and_salary = df.select("First Name", "Salary")
name_and_salary.show()

+----------+------+
|First Name|Salary|
+----------+------+
|   Douglas| 97308|
|    Thomas| 61933|
|     Maria|130590|
|     Jerry|138705|
|     Larry|101004|
|    Dennis|115163|
|      Ruby| 65476|
|      NULL| 45906|
|    Angela| 95570|
|   Frances|139852|
|    Louise| 63241|
|     Julie|102508|
|   Brandon|112807|
|      Gary|109831|
|  Kimberly| 41426|
|   Lillian| 59414|
|    Jeremy| 90370|
|     Shawn|111737|
|     Diana|132940|
|     Donna| 81014|
+----------+------+
only showing top 20 rows


In [10]:
# expr() takes the computation as a SQL expression string; the output matches
# the col("Salary") + 5 cell above.
salary_plus_five_sql = expr("Salary + 5")
df.select(salary_plus_five_sql).show()

+------------+
|(Salary + 5)|
+------------+
|       97313|
|       61938|
|      130595|
|      138710|
|      101009|
|      115168|
|       65481|
|       45911|
|       95575|
|      139857|
|       63246|
|      102513|
|      112812|
|      109836|
|       41431|
|       59419|
|       90375|
|      111742|
|      132945|
|       81019|
+------------+
only showing top 20 rows


Alias and concat using `expr`

In [12]:
# Backticks quote column names inside the SQL strings (needed for `First Name`,
# which contains a space). The first two expressions rename their column with `as`;
# the concat is left unaliased, so Spark names the column after the expression itself.
# Rows whose First Name is NULL produce a NULL concat result.
aliased_and_joined = [
    expr("`Salary` as slt"),
    expr("`First Name` as fn"),
    expr("concat(`First Name`,`Gender`)"),
]
df.select(*aliased_and_joined).show()

+------+--------+--------------------------+
|   slt|      fn|concat(First Name, Gender)|
+------+--------+--------------------------+
| 97308| Douglas|               DouglasMale|
| 61933|  Thomas|                ThomasMale|
|130590|   Maria|               MariaFemale|
|138705|   Jerry|                 JerryMale|
|101004|   Larry|                 LarryMale|
|115163|  Dennis|                DennisMale|
| 65476|    Ruby|                RubyFemale|
| 45906|    NULL|                      NULL|
| 95570|  Angela|              AngelaFemale|
|139852| Frances|             FrancesFemale|
| 63241|  Louise|              LouiseFemale|
|102508|   Julie|               JulieFemale|
|112807| Brandon|               BrandonMale|
|109831|    Gary|                  GaryMale|
| 41426|Kimberly|            KimberlyFemale|
| 59414| Lillian|             LillianFemale|
| 90370|  Jeremy|                JeremyMale|
|111737|   Shawn|                 ShawnMale|
|132940|   Diana|               DianaFemale|
| 81014|  

Spark SQL

In [16]:
# Register df as a global temporary view named "emp_tb" so it can be queried with SQL.
df.createOrReplaceGlobalTempView("emp_tb")

In [17]:
# The view was registered as a global temp view, so it is addressed through the
# `global_temp` qualifier rather than by its bare name.
emp_query = """
select * from global_temp.emp_tb
"""
spark.sql(emp_query).show()


+----------+------+----------+---------------+------+-------+-----------------+--------------------+
|First Name|Gender|Start Date|Last Login Time|Salary|Bonus %|Senior Management|                Team|
+----------+------+----------+---------------+------+-------+-----------------+--------------------+
|   Douglas|  Male|  8/6/1993|       12:42 PM| 97308|  6.945|             true|           Marketing|
|    Thomas|  Male| 3/31/1996|        6:53 AM| 61933|   4.17|             true|                NULL|
|     Maria|Female| 4/23/1993|       11:17 AM|130590| 11.858|            false|             Finance|
|     Jerry|  Male|  3/4/2005|        1:00 PM|138705|   9.34|             true|             Finance|
|     Larry|  Male| 1/24/1998|        4:47 PM|101004|  1.389|             true|     Client Services|
|    Dennis|  Male| 4/18/1987|        1:35 AM|115163| 10.125|            false|               Legal|
|      Ruby|Female| 8/17/1987|        4:20 PM| 65476| 10.012|             true|            

In [19]:
# Display df again after the select/SQL cells; the visible output matches the first df.show().
df.show()

+----------+------+----------+---------------+------+-------+-----------------+--------------------+
|First Name|Gender|Start Date|Last Login Time|Salary|Bonus %|Senior Management|                Team|
+----------+------+----------+---------------+------+-------+-----------------+--------------------+
|   Douglas|  Male|  8/6/1993|       12:42 PM| 97308|  6.945|             true|           Marketing|
|    Thomas|  Male| 3/31/1996|        6:53 AM| 61933|   4.17|             true|                NULL|
|     Maria|Female| 4/23/1993|       11:17 AM|130590| 11.858|            false|             Finance|
|     Jerry|  Male|  3/4/2005|        1:00 PM|138705|   9.34|             true|             Finance|
|     Larry|  Male| 1/24/1998|        4:47 PM|101004|  1.389|             true|     Client Services|
|    Dennis|  Male| 4/18/1987|        1:35 AM|115163| 10.125|            false|               Legal|
|      Ruby|Female| 8/17/1987|        4:20 PM| 65476| 10.012|             true|            