Part A: Exercises on Local View

In [12]:
from pyspark.sql import Row

# Row factory: calling Row with only field names yields a constructor, so each
# record below supplies just its values, in this column order.
Employee = Row("EmpID", "Name", "Department", "Project", "Salary", "HoursPerWeek")

data = [
    Employee(101, "Ravi", "Engineering", "AI Engine", 95000, 42),
    Employee(102, "Sneha", "Engineering", "Data Platform", 87000, 45),
    Employee(103, "Kabir", "Marketing", "Product Launch", 65000, 40),
    Employee(104, "Anita", "Sales", "Client Outreach", 70000, 38),
    Employee(105, "Divya", "Engineering", "AI Engine", 99000, 48),
    Employee(106, "Amit", "Marketing", "Social Media", 62000, 35),
    Employee(107, "Priya", "HR", "Policy Revamp", 58000, 37),
    Employee(108, "Manav", "Sales", "Lead Gen", 73000, 41),
    Employee(109, "Neha", "Engineering", "Security Suite", 91000, 46),
    Employee(110, "Farah", "HR", "Onboarding", 60000, 36),
]

df = spark.createDataFrame(data)

# Register the same DataFrame twice: once as a temp view ("employees_local")
# and once as a global temp view ("employees_global", queried below through
# the global_temp schema).
df.createOrReplaceTempView("employees_local")
df.createOrReplaceGlobalTempView("employees_global")


In [14]:
# 1. Every employee assigned to the 'AI Engine' project.
ai_engine_staff = spark.sql("SELECT * FROM employees_local WHERE Project = 'AI Engine'")
ai_engine_staff.show()

+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+



In [13]:
# 2. Marketing employees earning more than 60000.
marketing_above_60k = spark.sql(
    "SELECT * FROM employees_local WHERE Department = 'Marketing' AND Salary > 60000"
)
marketing_above_60k.show()


+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+



In [15]:
# 3. Average salary per department.
avg_salary_by_dept = spark.sql(
    "SELECT Department, AVG(Salary) AS Avg_Salary FROM employees_local GROUP BY Department"
)
avg_salary_by_dept.show()

+-----------+----------+
| Department|Avg_Salary|
+-----------+----------+
|      Sales|   71500.0|
|Engineering|   93000.0|
|  Marketing|   63500.0|
|         HR|   59000.0|
+-----------+----------+



In [16]:
# 4. The three highest-paid employees, highest salary first.
top_three_earners = spark.sql("SELECT * FROM employees_local ORDER BY Salary DESC LIMIT 3")
top_three_earners.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [17]:
# 5. Employees working more than 40 hours per week.
over_40_hours = spark.sql("SELECT * FROM employees_local WHERE HoursPerWeek > 40")
over_40_hours.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [18]:
# 6. Headcount per project (local view).
headcount_by_project = spark.sql(
    "SELECT Project, COUNT(*) AS Employee_Count FROM employees_local GROUP BY Project"
)
headcount_by_project.show()

+---------------+--------------+
|        Project|Employee_Count|
+---------------+--------------+
|  Data Platform|             1|
|      AI Engine|             2|
| Product Launch|             1|
|Client Outreach|             1|
| Security Suite|             1|
|  Policy Revamp|             1|
|       Lead Gen|             1|
|   Social Media|             1|
|     Onboarding|             1|
+---------------+--------------+



In [19]:
# 7. Drop the local temp view; the call's return value is displayed as the
#    cell result.
view_dropped = spark.catalog.dropTempView("employees_local")
view_dropped

True

In [20]:
# 8. Querying the view after it was dropped must fail.
#    The only failure this cell is meant to demonstrate is the analyzer's
#    "table or view not found" error, so catch exactly that type. Any other
#    exception is left to propagate instead of being printed as if expected.
from pyspark.sql.utils import AnalysisException

try:
    spark.sql("SELECT * FROM employees_local").show()
except AnalysisException as e:
    print("Error:", e)

Error: [TABLE_OR_VIEW_NOT_FOUND] The table or view `employees_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [employees_local], [], false



 Part B: Exercises on Global View

In [21]:
# 1. HR employees working fewer than 38 hours per week (global view).
hr_under_38_hours = spark.sql(
    "SELECT * FROM global_temp.employees_global WHERE Department = 'HR' AND HoursPerWeek < 38"
)
hr_under_38_hours.show()

+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+



In [22]:
# 2. Total salary payout per department (global view).
payout_by_dept = spark.sql(
    "SELECT Department, SUM(Salary) AS Total_Payout FROM global_temp.employees_global GROUP BY Department"
)
payout_by_dept.show()

+-----------+------------+
| Department|Total_Payout|
+-----------+------------+
|      Sales|      143000|
|Engineering|      372000|
|  Marketing|      127000|
|         HR|      118000|
+-----------+------------+



In [23]:
# 3. Add a Status column: 'Overworked' when HoursPerWeek is above 45,
#    otherwise 'Normal'.
workload_status_sql = """
SELECT *,
  CASE
    WHEN HoursPerWeek > 45 THEN 'Overworked'
    ELSE 'Normal'
  END AS Status
FROM global_temp.employees_global
"""
spark.sql(workload_status_sql).show()

+-----+-----+-----------+---------------+------+------------+----------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|    Status|
+-----+-----+-----------+---------------+------+------------+----------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|    Normal|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|    Normal|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|    Normal|
|  104|Anita|      Sales|Client Outreach| 70000|          38|    Normal|
|  105|Divya|Engineering|      AI Engine| 99000|          48|Overworked|
|  106| Amit|  Marketing|   Social Media| 62000|          35|    Normal|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|    Normal|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|    Normal|
|  109| Neha|Engineering| Security Suite| 91000|          46|Overworked|
|  110|Farah|         HR|     Onboarding| 60000|          36|    Normal|
+-----+-----+-----------+---------------+------+------------+----------+

In [24]:
# 4. Headcount per project (global view).
global_headcount_by_project = spark.sql(
    "SELECT Project, COUNT(*) AS Employee_Count FROM global_temp.employees_global GROUP BY Project"
)
global_headcount_by_project.show()

+---------------+--------------+
|        Project|Employee_Count|
+---------------+--------------+
|  Data Platform|             1|
|      AI Engine|             2|
| Product Launch|             1|
|Client Outreach|             1|
| Security Suite|             1|
|  Policy Revamp|             1|
|       Lead Gen|             1|
|   Social Media|             1|
|     Onboarding|             1|
+---------------+--------------+



In [25]:
# 5. Employees whose salary is strictly above their own department's average.
#    The CTE computes one average per department; the join then compares each
#    employee against the average of the matching department.
above_dept_avg_sql = """
WITH dept_avg AS (
  SELECT Department, AVG(Salary) AS AvgSal
  FROM global_temp.employees_global
  GROUP BY Department
)
SELECT e.*
FROM global_temp.employees_global e
JOIN dept_avg d ON e.Department = d.Department
WHERE e.Salary > d.AvgSal
"""
spark.sql(above_dept_avg_sql).show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  103|Kabir|  Marketing|Product Launch| 65000|          40|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  110|Farah|         HR|    Onboarding| 60000|          36|
+-----+-----+-----------+--------------+------+------------+



In [26]:
# 6. Read the global temp view from a *different* Spark session.
#    SparkSession.builder.getOrCreate() returns the already-active session, so
#    it would only query the view from the session that created it.
#    newSession() returns a separate session on the same SparkContext with its
#    own temp-view registry, so a successful read here shows the global_temp
#    view is visible across sessions.
# NOTE(review): the SparkSession import is no longer used by this cell; it is
# kept in case later cells rely on the name being in scope — confirm and drop.
from pyspark.sql import SparkSession
new_spark = spark.newSession()
new_spark.sql("SELECT * FROM global_temp.employees_global").show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



In [28]:
# 1. Rank employees inside each department by salary, highest salary first.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# One window partition per department, rows ordered by descending salary.
windowSpec = (
    Window
    .partitionBy("Department")
    .orderBy(col("Salary").desc())
)

ranked_df = (
    spark.table("global_temp.employees_global")
    .withColumn("Rank", rank().over(windowSpec))
)

# Show only the identifying columns next to the computed rank.
(
    ranked_df
    .select("EmpID", "Name", "Department", "Salary", "Rank")
    .show()
)

+-----+-----+-----------+------+----+
|EmpID| Name| Department|Salary|Rank|
+-----+-----+-----------+------+----+
|  105|Divya|Engineering| 99000|   1|
|  101| Ravi|Engineering| 95000|   2|
|  109| Neha|Engineering| 91000|   3|
|  102|Sneha|Engineering| 87000|   4|
|  110|Farah|         HR| 60000|   1|
|  107|Priya|         HR| 58000|   2|
|  103|Kabir|  Marketing| 65000|   1|
|  106| Amit|  Marketing| 62000|   2|
|  108|Manav|      Sales| 73000|   1|
|  104|Anita|      Sales| 70000|   2|
+-----+-----+-----------+------+----+



In [29]:
# 2. Expose the Engineering rows of the global view as their own temp view,
#    then read the new view back to confirm its contents.
is_engineering = col("Department") == "Engineering"
engineering_df = spark.table("global_temp.employees_global").where(is_engineering)

engineering_df.createOrReplaceTempView("engineering_employees")
spark.sql("SELECT * FROM engineering_employees").show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [30]:
# 3. Expose employees with HoursPerWeek of at least 38 as the
#    "active_employees" temp view, then read the new view back.
works_38_or_more = col("HoursPerWeek") >= 38
active_df = spark.table("global_temp.employees_global").where(works_38_or_more)

active_df.createOrReplaceTempView("active_employees")
spark.sql("SELECT * FROM active_employees").show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
+-----+-----+-----------+---------------+------+------------+

