In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as FS
from pyspark.sql import Window as WN

In [2]:
# Employee rows: (ID, NAME, SALARY, DEPARTMENTID).
employee_date = [
    (1, 'JOE',   85000, 1),
    (2, 'Henry', 80000, 2),
    (3, 'Sam',   60000, 2),
    (4, 'Max',   90000, 1),
    (5, 'Janet', 69000, 1),
    (6, 'Randy', 85000, 1),
    (7, 'Will',  70000, 1),
]
emp_schema = ['ID', 'NAME', 'SALARY', 'DEPARTMENTID']

# Department rows: (ID, NAME). Employee DEPARTMENTID values refer to this ID.
dept_data = [
    (1, 'IT'),
    (2, 'Sales'),
]
dept_schema = ['ID', 'NAME']

In [3]:
# Start (or reuse, via getOrCreate) a Spark session on the local[2] master.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("MAX_SAL_EMP")
    .getOrCreate()
)

In [4]:
# Build the employee DataFrame from the in-memory rows, then print it.
emp_df = spark.createDataFrame(
    employee_date,
    schema=emp_schema,
)
emp_df.show()

+---+-----+------+------------+
| ID| NAME|SALARY|DEPARTMENTID|
+---+-----+------+------------+
|  1|  JOE| 85000|           1|
|  2|Henry| 80000|           2|
|  3|  Sam| 60000|           2|
|  4|  Max| 90000|           1|
|  5|Janet| 69000|           1|
|  6|Randy| 85000|           1|
|  7| Will| 70000|           1|
+---+-----+------+------------+



In [5]:
# Build the department DataFrame from the in-memory rows, then print it.
dept_df = spark.createDataFrame(
    dept_data,
    schema=dept_schema,
)
dept_df.show()

+---+-----+
| ID| NAME|
+---+-----+
|  1|   IT|
|  2|Sales|
+---+-----+



#### Find the employee(s) in each department who earn the highest salary

#### Method-1: DENSE_RANK()

In [10]:
# Rank employees within each department by salary (highest first) and keep the
# rank-1 rows. dense_rank() gives equal salaries the same rank, so every
# employee tied for a department's top salary is returned.
result = (
    emp_df.join(dept_df, emp_df.DEPARTMENTID == dept_df.ID, 'inner')
    .withColumn(
        "RNK",
        FS.dense_rank().over(
            WN.partitionBy('DEPARTMENTID').orderBy(FS.col('SALARY').desc())
        ),
    )
    .withColumn("DEPARTMENT", dept_df.NAME)
    # Filter while RNK is still a column of the frame. Filtering after the
    # select() below would reference a column the projection has dropped.
    .filter(FS.col('RNK') == 1)
    # emp_df.NAME is passed as a Column because both sides of the join have a
    # NAME column; the plain string would be ambiguous.
    .select("DEPARTMENT", emp_df.NAME, "SALARY")
)

result.show()

+----------+-----+------+
|DEPARTMENT| NAME|SALARY|
+----------+-----+------+
|        IT|  Max| 90000|
|     Sales|Henry| 80000|
+----------+-----+------+



#### Method-2: INNER JOIN

In [7]:
# Attach department details to every employee. ID and NAME exist on both
# sides of the join, so the columns are picked per DataFrame and aliased.
join_data = (
    emp_df
    .join(dept_df, emp_df["DEPARTMENTID"] == dept_df["ID"])
    .select(
        emp_df["ID"].alias("emp_id"),
        emp_df["NAME"],
        emp_df["SALARY"],
        dept_df["ID"].alias("dept_id"),
        dept_df["NAME"].alias("dpt"),
    )
)
join_data.show()

+------+-----+------+-------+-----+
|emp_id| NAME|SALARY|dept_id|  dpt|
+------+-----+------+-------+-----+
|     1|  JOE| 85000|      1|   IT|
|     4|  Max| 90000|      1|   IT|
|     5|Janet| 69000|      1|   IT|
|     6|Randy| 85000|      1|   IT|
|     7| Will| 70000|      1|   IT|
|     2|Henry| 80000|      2|Sales|
|     3|  Sam| 60000|      2|Sales|
+------+-----+------+-------+-----+



In [8]:
# Highest salary per department id, exposed as MAX_SAL.
max_sal_data = (
    emp_df
    .groupBy("DEPARTMENTID")
    .agg(FS.max("SALARY").alias("MAX_SAL"))
)
max_sal_data.show()

+------------+-------+
|DEPARTMENTID|MAX_SAL|
+------------+-------+
|           1|  90000|
|           2|  80000|
+------------+-------+



In [9]:
# Pair each employee row with their department's maximum salary, then keep
# only the rows whose salary equals that maximum. max_sal_data carries a
# broadcast hint for the join.
(
    join_data
    .join(
        FS.broadcast(max_sal_data),
        on=join_data["dept_id"] == max_sal_data["DEPARTMENTID"],
        how="inner",
    )
    .filter("SALARY == MAX_SAL")
    .select("emp_id", "NAME", "SALARY", "dpt")
    .show()
)

+------+-----+------+-----+
|emp_id| NAME|SALARY|  dpt|
+------+-----+------+-----+
|     4|  Max| 90000|   IT|
|     2|Henry| 80000|Sales|
+------+-----+------+-----+

