In [0]:
# Build the employee DataFrame from in-memory rows.
# NOTE(review): `spark` is not defined in this notebook; presumably the
# SparkSession is supplied by the runtime -- confirm.
columns = ["name", "age", "department", "salary"]
employee_data = [
    ("Nancy", 32, "HR", 7000),
    ("Megala", 28, "IT", 10000),
    ("John", 35, "IT", 12000),
    ("Mathew", 23, "Marketing", 4500),
]
# Only column names are given, so Spark infers each column's type from the values.
df = spark.createDataFrame(data=employee_data, schema=columns)
df.show()

+------+---+----------+------+
|  name|age|department|salary|
+------+---+----------+------+
| Nancy| 32|        HR|  7000|
|Megala| 28|        IT| 10000|
|  John| 35|        IT| 12000|
|Mathew| 23| Marketing|  4500|
+------+---+----------+------+



In [0]:
# Keep only the rows whose age is strictly greater than 25.
df.filter(df["age"] > 25).show()

+------+---+----------+------+
|  name|age|department|salary|
+------+---+----------+------+
| Nancy| 32|        HR|  7000|
|Megala| 28|        IT| 10000|
|  John| 35|        IT| 12000|
+------+---+----------+------+



In [0]:
# Number of employees in each department.
(
    df.groupBy("department")
    .count()
    .show()
)

+----------+-----+
|department|count|
+----------+-----+
|        HR|    1|
|        IT|    2|
| Marketing|    1|
+----------+-----+



In [0]:
# Show schema: print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
# Project the DataFrame down to the department and salary columns only.
df.select(df["department"], df["salary"]).show()

+----------+------+
|department|salary|
+----------+------+
|        HR|  7000|
|        IT| 10000|
|        IT| 12000|
| Marketing|  4500|
+----------+------+



In [0]:
# Aggregations (avg, max, min) over the whole salary column.
# The functions module is imported under an alias instead of pulling `max`
# and `min` into the notebook namespace, where they would shadow Python's
# built-in max()/min() for every cell executed afterwards.
from pyspark.sql import functions as F

df.select(F.avg("salary")).show()  # average salary
df.select(F.max("salary")).show()  # max salary
df.select(F.min("salary")).show()  # min salary

+-----------+
|avg(salary)|
+-----------+
|     8375.0|
+-----------+

+-----------+
|max(salary)|
+-----------+
|      12000|
+-----------+

+-----------+
|min(salary)|
+-----------+
|       4500|
+-----------+



In [0]:
# Order the rows from highest to lowest salary.
df.orderBy(df["salary"].desc()).show()

+------+---+----------+------+
|  name|age|department|salary|
+------+---+----------+------+
|  John| 35|        IT| 12000|
|Megala| 28|        IT| 10000|
| Nancy| 32|        HR|  7000|
|Mathew| 23| Marketing|  4500|
+------+---+----------+------+



In [0]:
# IT employees older than 30: both conditions must hold for a row to be kept.
# Each comparison is parenthesised because `&` binds tighter than `>` and `==`.
df.filter(
    (df["age"] > 30) & (df["department"] == "IT")
).show()

+----+---+----------+------+
|name|age|department|salary|
+----+---+----------+------+
|John| 35|        IT| 12000|
+----+---+----------+------+



In [0]:
# Add a derived column holding the salary expressed in thousands.
# `df` is rebound to the widened DataFrame, so cells that run after this one
# see the extra `salary_in_k` column.
df = df.withColumn("salary_in_k", df["salary"] / 1000)
df.show()

+------+---+----------+------+-----------+
|  name|age|department|salary|salary_in_k|
+------+---+----------+------+-----------+
| Nancy| 32|        HR|  7000|        7.0|
|Megala| 28|        IT| 10000|       10.0|
|  John| 35|        IT| 12000|       12.0|
|Mathew| 23| Marketing|  4500|        4.5|
+------+---+----------+------+-----------+



In [0]:
# Average salary within each department.
(
    df.groupBy("department")
    .avg("salary")
    .show()
)

+----------+-----------+
|department|avg(salary)|
+----------+-----------+
|        HR|     7000.0|
|        IT|    11000.0|
| Marketing|     4500.0|
+----------+-----------+

