<a href="https://colab.research.google.com/github/sumaaithal/PySpark_30Days_Challenge/blob/main/pyspark4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os

# Tell the Spark tooling where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [3]:
import findspark

# findspark.init() has to run before the pyspark import below: it makes the
# Spark installation referenced by SPARK_HOME importable from this kernel.
findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a session that runs Spark locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Eager evaluation renders a DataFrame as a formatted table when it is the
# last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Display the session summary as this cell's output.
spark

In [9]:
from pyspark.sql.functions import *

In [5]:
# Sample records: (firstname, middlename, lastname, dob, gender, salary).
# Missing name parts are empty strings; dob is kept as a 'YYYY-MM-DD' string.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

In [6]:
from pyspark.sql import SparkSession

# Column names, in the same order as the fields of each tuple in `data`.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# getOrCreate() hands back the already-running session when one exists,
# so this rebinds `spark` rather than starting a second Spark application.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Only column names are supplied; Spark infers the column types from the data.
df = spark.createDataFrame(data=data, schema=columns)

In [7]:
# Print the DataFrame's schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [4]:
## withColumn

In [11]:
# Reusing an existing column name replaces that column: here salary is cast to
# integer. show() prints values only, so the type change is not visible in the table.
salary_as_int = col("salary").cast("integer")
df.withColumn("salary", salary_as_int).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [12]:
# Replace salary with salary * 100; `df` itself is not modified, only the
# derived DataFrame is displayed.
(
    df
    .withColumn("salary", col("salary") * 100)
    .show()
)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|
|   Robert|          |Williams|1978-09-05|     M|400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -100|
+---------+----------+--------+----------+------+------+



In [13]:
# A new column name appends a column. Despite its name, "copiedColumn" holds
# the negated salary (salary * -1), not an unchanged copy.
negated_salary = col("salary") * -1
df.withColumn("copiedColumn", negated_salary).show()

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|copiedColumn|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   Smith|1991-04-01|     M|  3000|       -3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|       -4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|       -4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|       -4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           1|
+---------+----------+--------+----------+------+------+------------+



In [14]:
# lit() wraps a Python constant as a column expression, so every row gets "USA".
usa = lit("USA")
df.withColumn("country", usa).show()

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|country|
+---------+----------+--------+----------+------+------+-------+
|    James|          |   Smith|1991-04-01|     M|  3000|    USA|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    USA|
|   Robert|          |Williams|1978-09-05|     M|  4000|    USA|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA|
+---------+----------+--------+----------+------+------+-------+



In [18]:
# withColumn returns a new DataFrame, so calls can be chained to append
# several constant-valued columns in one expression.
(
    df
    .withColumn("country", lit("USA"))
    .withColumn("anotherColumn", lit("anotherCountry"))
    .show()
)

+---------+----------+--------+----------+------+------+-------+--------------+
|firstname|middlename|lastname|       dob|gender|salary|country| anotherColumn|
+---------+----------+--------+----------+------+------+-------+--------------+
|    James|          |   Smith|1991-04-01|     M|  3000|    USA|anotherCountry|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    USA|anotherCountry|
|   Robert|          |Williams|1978-09-05|     M|  4000|    USA|anotherCountry|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA|anotherCountry|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA|anotherCountry|
+---------+----------+--------+----------+------+------+-------+--------------+



In [19]:
## withColumnRenamed

In [20]:
# Rename the "gender" column to "Sex"; its position and values stay the same.
renamed = df.withColumnRenamed("gender", "Sex")
renamed.show()

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|       dob|Sex|salary|
+---------+----------+--------+----------+---+------+
|    James|          |   Smith|1991-04-01|  M|  3000|
|  Michael|      Rose|        |2000-05-19|  M|  4000|
|   Robert|          |Williams|1978-09-05|  M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|  F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|  F|    -1|
+---------+----------+--------+----------+---+------+



In [21]:
# drop() returns a DataFrame without the named column; `df` still has salary.
without_salary = df.drop("salary")
without_salary.show()

+---------+----------+--------+----------+------+
|firstname|middlename|lastname|       dob|gender|
+---------+----------+--------+----------+------+
|    James|          |   Smith|1991-04-01|     M|
|  Michael|      Rose|        |2000-05-19|     M|
|   Robert|          |Williams|1978-09-05|     M|
|    Maria|      Anne|   Jones|1967-12-01|     F|
|      Jen|      Mary|   Brown|1980-02-17|     F|
+---------+----------+--------+----------+------+

