In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reading data
employee_data1 = spark.read.format("csv")\
    .option("header",True)\
    .option("inferSchema",True)\
    .option("mode","PERMISSIVE")\
    .load("/FileStore/tables/employee_data_csv-1.csv")

employee_data1.show(5)

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
employee_data1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



# Select Statement

In [0]:
employee_data1.select("name").show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [0]:
employee_data1.select(col("name")).show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [0]:
employee_data1.select("id + 5").show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2960761336320886>:1[0m
[0;32m----> 1[0m [43memployee_data1[49m[38;5;241;43m.[39;49m[43mselect[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mid + 5[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, clas

In [0]:
employee_data1.select(col("id") + 5).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
employee_data1.select("id","name","age").show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [0]:
employee_data1.select(col("id"),col("name"),col("age")).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [0]:
employee_data1.select("id",col("name"),employee_data1["age"],employee_data1.address).show()

+---+--------+---+------------+
| id|    name|age|     address|
+---+--------+---+------------+
|  1|  Manish| 26|       bihar|
|  2|  Nikita| 23|uttarpradesh|
|  3|  Pritam| 22|   Bangalore|
|  4|Prantosh| 17|     Kolkata|
|  5|  Vikash| 31|        null|
+---+--------+---+------------+



# Expression

In [0]:
employee_data1.select(expr("id + 5")).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
employee_data1.select(expr('id as employee_id'),expr('name as employee_name'),
                      expr('concat(name,address)')).show()

+-----------+-------------+---------------------+
|employee_id|employee_name|concat(name, address)|
+-----------+-------------+---------------------+
|          1|       Manish|          Manishbihar|
|          2|       Nikita|   Nikitauttarpradesh|
|          3|       Pritam|      PritamBangalore|
|          4|     Prantosh|      PrantoshKolkata|
|          5|       Vikash|                 null|
+-----------+-------------+---------------------+



# Spark SQL

In [0]:
employee_data1.createOrReplaceTempView('employee_tbl')

In [0]:
spark.sql("""
          select * from employee_tbl
          """).show() #select all column in spark sql

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
employee_data1.select("*").show() #select all column in dataframe

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
spark.sql("""
          select id, name, age from employee_tbl
          """).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [0]:
employee_data1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



# Aliasing

In [0]:
employee_data1.select(col('id').alias('employee_id'),'name','age').show()

+-----------+--------+---+
|employee_id|    name|age|
+-----------+--------+---+
|          1|  Manish| 26|
|          2|  Nikita| 23|
|          3|  Pritam| 22|
|          4|Prantosh| 17|
|          5|  Vikash| 31|
+-----------+--------+---+



# Filter

In [0]:
employee_data1.filter(col('salary') > 150000).show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [0]:
employee_data1.where(col('salary') > 150000).show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [0]:
employee_data1.filter(col('salary') > 150000 and col('age') < 18).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-2960761336320905>:1[0m
[0;32m----> 1[0m employee_data1[38;5;241m.[39mfilter(col([38;5;124m'[39m[38;5;124msalary[39m[38;5;124m'[39m) [38;5;241m>[39m [38;5;241m150000[39m [38;5;129;01mand[39;00m col([38;5;124m'[39m[38;5;124mage[39m[38;5;124m'[39m) [38;5;241m<[39m [38;5;241m18[39m)[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/sql/column.py:1295[0m, in [0;36mColumn.__nonzero__[0;34m(self)[0m
[1;32m   1294[0m [38;5;28;01mdef[39;00m [38;5;21m__nonzero__[39m([38;5;28mself[39m) [38;5;241m-[39m[38;5;241m>[39m [38;5;28;01mNone[39;00m:
[0;32m-> 1295[0m     [38;5;28;01mraise[39;00m [38;5;167;01mValueError[39;00m(
[1;32m   1296[0m         [38;5;124m"[39m[38;5;124mCannot convert column into bool: please use [39m[38;5;124m'[39m

In [0]:
employee_data1.filter((col('salary') > 150000) & (col('age') < 18)).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
+---+--------+---+------+-------+-------+



# Literal

In [0]:
employee_data1.select("*", lit("Kumar").alias("last_name")).show()

+---+--------+---+------+------------+--------+---------+
| id|    name|age|salary|     address| nominee|last_name|
+---+--------+---+------+------------+--------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    Kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    Kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|    Kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|    Kumar|
|  5|  Vikash| 31|300000|        null|nominee5|    Kumar|
+---+--------+---+------+------------+--------+---------+



# Adding columns

In [0]:
employee_data1.withColumn("sur_name",lit("singh")).show()

+---+--------+---+------+------------+--------+--------+
| id|    name|age|salary|     address| nominee|sur_name|
+---+--------+---+------+------------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|   singh|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|   singh|
|  3|  Pritam| 22|150000|   Bangalore|   India|   singh|
|  4|Prantosh| 17|200000|     Kolkata|   India|   singh|
|  5|  Vikash| 31|300000|        null|nominee5|   singh|
+---+--------+---+------+------------+--------+--------+



# Renaming

In [0]:
employee_data1.withColumnRenamed("id","employee_id").show()

+-----------+--------+---+------+------------+--------+
|employee_id|    name|age|salary|     address| nominee|
+-----------+--------+---+------+------------+--------+
|          1|  Manish| 26| 75000|       bihar|nominee1|
|          2|  Nikita| 23|100000|uttarpradesh|nominee2|
|          3|  Pritam| 22|150000|   Bangalore|   India|
|          4|Prantosh| 17|200000|     Kolkata|   India|
|          5|  Vikash| 31|300000|        null|nominee5|
+-----------+--------+---+------+------------+--------+



In [0]:
new_emploee_df = employee_data1.withColumnRenamed("id","employee_id")

In [0]:
new_emploee_df.show()

+-----------+--------+---+------+------------+--------+
|employee_id|    name|age|salary|     address| nominee|
+-----------+--------+---+------+------------+--------+
|          1|  Manish| 26| 75000|       bihar|nominee1|
|          2|  Nikita| 23|100000|uttarpradesh|nominee2|
|          3|  Pritam| 22|150000|   Bangalore|   India|
|          4|Prantosh| 17|200000|     Kolkata|   India|
|          5|  Vikash| 31|300000|        null|nominee5|
+-----------+--------+---+------+------------+--------+



# Casting Data types

In [0]:
employee_data1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
employee_data1.withColumn("id",col("id").cast("string")).printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
employee_data1.withColumn("id",col("id").cast("string"))\
    .withColumn("salary",col("salary").cast("long"))\
    .printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



# Remove Columns

In [0]:
employee_data1.drop('id',col('name')).show()

+---+------+------------+--------+
|age|salary|     address| nominee|
+---+------+------------+--------+
| 26| 75000|       bihar|nominee1|
| 23|100000|uttarpradesh|nominee2|
| 22|150000|   Bangalore|   India|
| 17|200000|     Kolkata|   India|
| 31|300000|        null|nominee5|
+---+------+------------+--------+



# Spark SQL

In [0]:
# Alias
spark.sql("""
          select id as emp_id, name, age from employee_tbl
          """).show()

+------+--------+---+
|emp_id|    name|age|
+------+--------+---+
|     1|  Manish| 26|
|     2|  Nikita| 23|
|     3|  Pritam| 22|
|     4|Prantosh| 17|
|     5|  Vikash| 31|
+------+--------+---+



In [0]:
# Filter/Where
spark.sql("""
          select * from employee_tbl where salary > 150000 and age < 18
          """).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
+---+--------+---+------+-------+-------+



In [0]:
# Literal
spark.sql("""
          select *, "kumar" as last_name from employee_tbl where salary > 150000 and age < 18
          """).show()

+---+--------+---+------+-------+-------+---------+
| id|    name|age|salary|address|nominee|last_name|
+---+--------+---+------+-------+-------+---------+
|  4|Prantosh| 17|200000|Kolkata|  India|    kumar|
+---+--------+---+------+-------+-------+---------+



In [0]:

# Adding Columns
spark.sql("""
          select *, "kumar" as last_name, concat(name,last_name) as full_name from employee_tbl where salary > 150000 and age < 18
          """).show()

+---+--------+---+------+-------+-------+---------+-------------+
| id|    name|age|salary|address|nominee|last_name|    full_name|
+---+--------+---+------+-------+-------+---------+-------------+
|  4|Prantosh| 17|200000|Kolkata|  India|    kumar|Prantoshkumar|
+---+--------+---+------+-------+-------+---------+-------------+



In [0]:
#Renaming Columns
spark.sql("""
          select *, "kumar" as last_name, concat(name,last_name) as full_name, id as employee_id from employee_tbl where salary > 150000 and age < 18
          """).show()

+---+--------+---+------+-------+-------+---------+-------------+-----------+
| id|    name|age|salary|address|nominee|last_name|    full_name|employee_id|
+---+--------+---+------+-------+-------+---------+-------------+-----------+
|  4|Prantosh| 17|200000|Kolkata|  India|    kumar|Prantoshkumar|          4|
+---+--------+---+------+-------+-------+---------+-------------+-----------+



In [0]:
# Casting Data type
spark.sql("""
          select *, "kumar" as last_name, concat(name,last_name) as full_name, cast(id as string) from employee_tbl where salary > 150000 and age < 18
          """).printSchema()


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- last_name: string (nullable = false)
 |-- full_name: string (nullable = true)
 |-- id: string (nullable = true)



In [0]:
# Removing Columns - dont write in select statement if no need any columns


# Unique & sorted records

In [0]:
data=[(10 ,'Anil',50000, 18),
(11 ,'Vikas',75000,  16),
(12 ,'Nisha',40000,  18),
(13 ,'Nidhi',60000,  17),
(14 ,'Priya',80000,  18),
(15 ,'Mohit',45000,  18),
(16 ,'Rajesh',90000, 10),
(17 ,'Raman',55000, 16),
(18 ,'Sam',65000,   17),
(15 ,'Mohit',45000,  18),
(13 ,'Nidhi',60000,  17),      
(14 ,'Priya',90000,  18),  
(18 ,'Sam',65000,   17)
     ]


schema = ['id','Name','sal','mngr_id']

manager_df = spark.createDataFrame(data=data,schema=schema)

In [0]:
manager_df.show()

+---+------+-----+-------+
| id|  Name|  sal|mngr_id|
+---+------+-----+-------+
| 10|  Anil|50000|     18|
| 11| Vikas|75000|     16|
| 12| Nisha|40000|     18|
| 13| Nidhi|60000|     17|
| 14| Priya|80000|     18|
| 15| Mohit|45000|     18|
| 16|Rajesh|90000|     10|
| 17| Raman|55000|     16|
| 18|   Sam|65000|     17|
| 15| Mohit|45000|     18|
| 13| Nidhi|60000|     17|
| 14| Priya|90000|     18|
| 18|   Sam|65000|     17|
+---+------+-----+-------+



In [0]:
manager_df.distinct().show()

+---+------+-----+-------+
| id|  Name|  sal|mngr_id|
+---+------+-----+-------+
| 10|  Anil|50000|     18|
| 12| Nisha|40000|     18|
| 11| Vikas|75000|     16|
| 13| Nidhi|60000|     17|
| 15| Mohit|45000|     18|
| 14| Priya|80000|     18|
| 16|Rajesh|90000|     10|
| 17| Raman|55000|     16|
| 18|   Sam|65000|     17|
| 14| Priya|90000|     18|
+---+------+-----+-------+



In [0]:
manager_df.distinct().count()

Out[4]: 10

In [0]:
manager_df.count()

Out[5]: 13

In [0]:
manager_df.distinct("id","name").show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-1195057310164348>:1[0m
[0;32m----> 1[0m [43mmanager_df[49m[38;5;241;43m.[39;49m[43mdistinct[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mid[39;49m[38;5;124;43m"[39;49m[43m,[49m[38;5;124;43m"[39;49m[38;5;124;43mname[39;49m[38;5;124;43m"[39;49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     log

In [0]:
manager_df.select("id","name").distinct().show()

+---+------+
| id|  name|
+---+------+
| 10|  Anil|
| 11| Vikas|
| 12| Nisha|
| 13| Nidhi|
| 15| Mohit|
| 14| Priya|
| 17| Raman|
| 16|Rajesh|
| 18|   Sam|
+---+------+



In [0]:
dropped_manager_df = manager_df.dropDuplicates(["id","name","sal","mngr_id"])

In [0]:
dropped_manager_df.show()

+---+------+-----+-------+
| id|  Name|  sal|mngr_id|
+---+------+-----+-------+
| 10|  Anil|50000|     18|
| 12| Nisha|40000|     18|
| 11| Vikas|75000|     16|
| 13| Nidhi|60000|     17|
| 15| Mohit|45000|     18|
| 14| Priya|80000|     18|
| 16|Rajesh|90000|     10|
| 17| Raman|55000|     16|
| 18|   Sam|65000|     17|
| 14| Priya|90000|     18|
+---+------+-----+-------+



In [0]:
from pyspark.sql.functions import col

In [0]:
manager_df.sort(col("sal")).show()

+---+------+-----+-------+
| id|  Name|  sal|mngr_id|
+---+------+-----+-------+
| 12| Nisha|40000|     18|
| 15| Mohit|45000|     18|
| 15| Mohit|45000|     18|
| 10|  Anil|50000|     18|
| 17| Raman|55000|     16|
| 13| Nidhi|60000|     17|
| 13| Nidhi|60000|     17|
| 18|   Sam|65000|     17|
| 18|   Sam|65000|     17|
| 11| Vikas|75000|     16|
| 14| Priya|80000|     18|
| 16|Rajesh|90000|     10|
| 14| Priya|90000|     18|
+---+------+-----+-------+



In [0]:
manager_df.sort(col("sal").desc()).show()

+---+------+-----+-------+
| id|  Name|  sal|mngr_id|
+---+------+-----+-------+
| 16|Rajesh|90000|     10|
| 14| Priya|90000|     18|
| 14| Priya|80000|     18|
| 11| Vikas|75000|     16|
| 18|   Sam|65000|     17|
| 18|   Sam|65000|     17|
| 13| Nidhi|60000|     17|
| 13| Nidhi|60000|     17|
| 17| Raman|55000|     16|
| 10|  Anil|50000|     18|
| 15| Mohit|45000|     18|
| 15| Mohit|45000|     18|
| 12| Nisha|40000|     18|
+---+------+-----+-------+



In [0]:
manager_df.sort(col("sal").desc(),col("Name").desc()).show()

+---+------+-----+-------+
| id|  Name|  sal|mngr_id|
+---+------+-----+-------+
| 16|Rajesh|90000|     10|
| 14| Priya|90000|     18|
| 14| Priya|80000|     18|
| 11| Vikas|75000|     16|
| 18|   Sam|65000|     17|
| 18|   Sam|65000|     17|
| 13| Nidhi|60000|     17|
| 13| Nidhi|60000|     17|
| 17| Raman|55000|     16|
| 10|  Anil|50000|     18|
| 15| Mohit|45000|     18|
| 15| Mohit|45000|     18|
| 12| Nisha|40000|     18|
+---+------+-----+-------+



In [0]:
# leetcode question - https://leetcode.com/problems/find-customer-referee/description/?envType=study-plan-v2&envId=top-sql-50


leet_code_data = [
    (1, 'Will', None),
    (2, 'Jane', None),
    (3, 'Alex', 2),
    (4, 'Bill', None),
    (5, 'Zack', 1),
    (6, 'Mark', 2)
]

schema = ['id','name','referee_id']

leetcode_df = spark.createDataFrame(data=leet_code_data,schema=schema)

Table: Customer

+-------------+---------+
| Column Name | Type    |
+-------------+---------+
| id          | int     |
| name        | varchar |
| referee_id  | int     |
+-------------+---------+
In SQL, id is the primary key column for this table.
Each row of this table indicates the id of a customer, their name, and the id of the customer who referred them.
 

Find the names of the customer that are not referred by the customer with id = 2.

Return the result table in any order.

The result format is in the following example.

 

Example 1:

Input: 
Customer table:
+----+------+------------+
| id | name | referee_id |
+----+------+------------+
| 1  | Will | null       |
| 2  | Jane | null       |
| 3  | Alex | 2          |
| 4  | Bill | null       |
| 5  | Zack | 1          |
| 6  | Mark | 2          |
+----+------+------------+
Output: 
+------+
| name |
+------+
| Will |
| Jane |
| Bill |
| Zack |
+------+

In [0]:
# Filter the DataFrame for customers where referee_id is either null or not equal to 2
result_df = leetcode_df.filter((leetcode_df.referee_id.isNull()) | (leetcode_df.referee_id != 2))

# Select only the 'name' column for the result
result_df = result_df.select('name')

# Show the result
result_df.show()

+----+
|name|
+----+
|Will|
|Jane|
|Bill|
|Zack|
+----+

