In [1]:
import sys
import os

In [2]:
# Show which JDK the environment points at (None if JAVA_HOME is unset).
os.getenv('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
# findspark is used so that pyspark can be imported in the next cell.
# init() is called with no arguments, so it relies on its default lookup of
# the Spark installation (presumably via SPARK_HOME — confirm on this machine).
import findspark
findspark.init()

In [4]:
# Spark entry point plus the SQL function and type helpers.
# NOTE(review): the two star imports bind every public name of
# pyspark.sql.functions and pyspark.sql.types into the notebook namespace.
# pyspark.sql.functions exports names such as sum, min, max, abs and round,
# which shadow the Python builtins of the same name from this cell onwards.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
# Start (or reuse) a Spark session running locally with 4 threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("SparkSQL")
    .getOrCreate()
)

In [6]:
# Sample employee records as (emp_name, department, salary) tuples.
# The rows for Julian, Jenny and Salim each appear twice; those exact repeats
# are what the duplicate-handling cells operate on.
data = (
    ("Julian", "accounts", 30000),
    ("Mason", "accounts", 46000),
    ("Rashid", "sales", 41000),
    ("Jenny", "marketing", 30000),
    ("Milan", "finance", 30000),
    ("Julian", "accounts", 30000),
    ("Salim", "sales", 41000),
    ("Scott", "finance", 33000),
    ("Jen", "finance", 39000),
    ("Jenny", "marketing", 30000),
    ("Kumaran", "marketing", 20000),
    ("Salim", "sales", 41000),
)

# Distribute the local tuples as an RDD.
rdd = spark.sparkContext.parallelize(data)

# An assignment displays nothing; collect the rows back to the driver as the
# cell's last expression so that the cell actually renders the list of rows.
rdd.collect()

[('Julian', 'accounts', 30000),
 ('Mason', 'accounts', 46000),
 ('Rashid', 'sales', 41000),
 ('Jenny', 'marketing', 30000),
 ('Milan', 'finance', 30000),
 ('Julian', 'accounts', 30000),
 ('Salim', 'sales', 41000),
 ('Scott', 'finance', 33000),
 ('Jen', 'finance', 39000),
 ('Jenny', 'marketing', 30000),
 ('Kumaran', 'marketing', 20000),
 ('Salim', 'sales', 41000)]

In [7]:
# Build a DataFrame from the RDD; only column names are given, so the column
# types are inferred from the data.
employee_columns = ["emp_name", "department", "salary"]
df = spark.createDataFrame(rdd, schema=employee_columns)
df.show()

+--------+----------+------+
|emp_name|department|salary|
+--------+----------+------+
|  Julian|  accounts| 30000|
|   Mason|  accounts| 46000|
|  Rashid|     sales| 41000|
|   Jenny| marketing| 30000|
|   Milan|   finance| 30000|
|  Julian|  accounts| 30000|
|   Salim|     sales| 41000|
|   Scott|   finance| 33000|
|     Jen|   finance| 39000|
|   Jenny| marketing| 30000|
| Kumaran| marketing| 20000|
|   Salim|     sales| 41000|
+--------+----------+------+



In [8]:
# Total number of rows, repeated rows included.
df.count()

12

In [9]:
# Number of rows left once rows that are identical in every column are
# collapsed into one.
df.distinct().count()

9

In [11]:
# Keep a single row per (department, salary) pair. Only these two columns are
# compared, so rows with different emp_name values can collapse together;
# nothing in this call controls which emp_name is the one retained.
dedup_keys = ["department", "salary"]
df.dropDuplicates(dedup_keys).show()

+--------+----------+------+
|emp_name|department|salary|
+--------+----------+------+
|  Julian|  accounts| 30000|
|   Mason|  accounts| 46000|
|   Milan|   finance| 30000|
|   Scott|   finance| 33000|
|     Jen|   finance| 39000|
| Kumaran| marketing| 20000|
|   Jenny| marketing| 30000|
|  Rashid|     sales| 41000|
+--------+----------+------+



## REPLACE NULLs

In [15]:
# Four records in which every column except the first row contains a None
# somewhere, to demonstrate null replacement.
# NOTE(review): this cell rebinds `data`, `rdd` and `df`, replacing the
# employee objects created earlier in the notebook.
data = (
    (1, "Julian", "accounts", 30000.00, 98012028375, "2023-02-04"),
    (2, "Rashid", None, 46000.25, 2837482748, None),
    (3, None, "sales", 41000.30, None, "2023-10-24"),
    (None, "Jenny", "marketing", None, 7003452847, "2023-05-16"),
)
null_demo_columns = ["id", "name", "department", "salary", "phone", "join_date"]

rdd = spark.sparkContext.parallelize(data)
df = spark.createDataFrame(rdd, schema=null_demo_columns)
df.show()

+----+------+----------+--------+-----------+----------+
|  id|  name|department|  salary|      phone| join_date|
+----+------+----------+--------+-----------+----------+
|   1|Julian|  accounts| 30000.0|98012028375|2023-02-04|
|   2|Rashid|      null|46000.25| 2837482748|      null|
|   3|  null|     sales| 41000.3|       null|2023-10-24|
|null| Jenny| marketing|    null| 7003452847|2023-05-16|
+----+------+----------+--------+-----------+----------+



In [35]:
# Replace nulls column by column with a placeholder of the matching type.
# NOTE(review): "IT" is applied to both `name` and `department`, so a missing
# name also becomes "IT"; and the salary placeholder is negative zero, which
# is displayed as -0.0 — confirm both are intended.
df2 = (
    df.na.fill(-0.0, subset="salary")
    .na.fill(-999, subset="id")
    .na.fill(-999999, subset="phone")
    .na.fill("IT", subset=["name", "department"])
    .na.fill("2024-01-01", subset="join_date")
)
df2.show()

+----+------+----------+--------+-----------+----------+
|  id|  name|department|  salary|      phone| join_date|
+----+------+----------+--------+-----------+----------+
|   1|Julian|  accounts| 30000.0|98012028375|2023-02-04|
|   2|Rashid|        IT|46000.25| 2837482748|2024-01-01|
|   3|    IT|     sales| 41000.3|    -999999|2023-10-24|
|-999| Jenny| marketing|    -0.0| 7003452847|2023-05-16|
+----+------+----------+--------+-----------+----------+

