In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6ebaced058603b893a9d92c9e59ab5b76d67b8d631405678216e6746564bf0e7
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Start (or reuse) the Spark session that every later cell relies on
spark = (
    SparkSession.builder
    .appName("Advanced DataFrame Operations")
    .getOrCreate()
)

# Column names shared by both sample DataFrames; each tuple below follows this order
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary', 'JoiningDate']

# First batch of sample employees (JoiningDate is kept as a yyyy-MM-dd string here)
data1 = [
    (1, 'Arjun', 'IT', 75000, '2022-01-15'),
    (2, 'Vijay', 'Finance', 85000, '2022-03-12'),
    (3, 'Shalini', 'IT', 90000, '2021-06-30'),
]

# Second batch of sample employees, same layout as the first
data2 = [
    (4, 'Sneha', 'HR', 50000, '2022-05-01'),
    (5, 'Rahul', 'Finance', 60000, '2022-08-20'),
    (6, 'Amit', 'IT', 55000, '2021-12-15'),
]

# Build one DataFrame per batch; column types are inferred from the Python values
employee_df1 = spark.createDataFrame(data1, columns)
employee_df2 = spark.createDataFrame(data2, columns)

# Preview each DataFrame under its own heading
for _heading, _frame in (
    ("Employee DataFrame 1:", employee_df1),
    ("Employee DataFrame 2:", employee_df2),
):
    print(_heading)
    _frame.show()



Employee DataFrame 1:
+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|
+----------+------------+----------+------+-----------+

Employee DataFrame 2:
+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+



In [None]:
# Stack the two employee DataFrames, then keep only one copy of each distinct row
union_df = (
    employee_df1
    .union(employee_df2)
    .distinct()
)
union_df.show()


+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         3|     Shalini|        IT| 90000| 2021-06-30|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+



In [None]:
# Append every row of the second DataFrame to the first.
# No de-duplication happens here, so repeated rows (if any) are all kept.
union_all_df = (
    employee_df1
    .union(employee_df2)
)

union_all_df.show()


+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+



In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
from pyspark.sql.functions import col

# Window: one partition per department, employees ordered from highest to lowest salary
window_spec = (
    Window
    .partitionBy("Department")
    .orderBy(F.desc("Salary"))
)

# Attach each employee's salary rank within their own department
ranked_df = union_all_df.withColumn(
    "Rank",
    F.rank().over(window_spec),
)
ranked_df.show()


+----------+------------+----------+------+-----------+----+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|Rank|
+----------+------------+----------+------+-----------+----+
|         2|       Vijay|   Finance| 85000| 2022-03-12|   1|
|         5|       Rahul|   Finance| 60000| 2022-08-20|   2|
|         4|       Sneha|        HR| 50000| 2022-05-01|   1|
|         3|     Shalini|        IT| 90000| 2021-06-30|   1|
|         1|       Arjun|        IT| 75000| 2022-01-15|   2|
|         6|        Amit|        IT| 55000| 2021-12-15|   3|
+----------+------------+----------+------+-----------+----+



In [12]:
from pyspark.sql.functions import sum

# Frame for a running total: rows of the same department in JoiningDate order,
# spanning from the first row of the partition up to and including the current row.
# JoiningDate is still a yyyy-MM-dd string at this point, so its lexical order
# matches chronological order.
window_spec_sum = (
    Window
    .partitionBy("Department")
    .orderBy("JoiningDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Accumulate Salary over that frame; the aggregate is reached through the F alias
# so it cannot be mistaken for Python's builtin sum
running_total_df = union_all_df.withColumn(
    "RunningTotal",
    F.sum("Salary").over(window_spec_sum),
)
running_total_df.show()




+----------+------------+----------+------+-----------+------------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|RunningTotal|
+----------+------------+----------+------+-----------+------------+
|         2|       Vijay|   Finance| 85000| 2022-03-12|       85000|
|         5|       Rahul|   Finance| 60000| 2022-08-20|      145000|
|         4|       Sneha|        HR| 50000| 2022-05-01|       50000|
|         3|     Shalini|        IT| 90000| 2021-06-30|       90000|
|         6|        Amit|        IT| 55000| 2021-12-15|      145000|
|         1|       Arjun|        IT| 75000| 2022-01-15|      220000|
+----------+------------+----------+------+-----------+------------+



In [13]:
# Parse the yyyy-MM-dd strings in JoiningDate into a real date column (same column name)
date_converted_df = union_all_df.withColumn(
    "JoiningDate",
    F.to_date("JoiningDate", "yyyy-MM-dd"),
)
date_converted_df.show()


+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+



In [14]:
# Years since joining, rounded to two decimals.
# months_between(...) / 12 counts calendar months (plus a fractional part), so it
# does not drift the way dividing a raw day count by a fixed 365 does once the
# span includes leap years.
# NOTE: current_date() is evaluated at run time, so this column changes from one
# run of the notebook to the next.
experience_df = date_converted_df.withColumn(
    "YearsOfExperience",
    F.round(F.months_between(F.current_date(), F.col("JoiningDate")) / 12, 2),
)
experience_df.show()


+----------+------------+----------+------+-----------+-----------------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|YearsOfExperience|
+----------+------------+----------+------+-----------+-----------------+
|         1|       Arjun|        IT| 75000| 2022-01-15|             2.64|
|         2|       Vijay|   Finance| 85000| 2022-03-12|             2.48|
|         3|     Shalini|        IT| 90000| 2021-06-30|             3.18|
|         4|       Sneha|        HR| 50000| 2022-05-01|             2.35|
|         5|       Rahul|   Finance| 60000| 2022-08-20|             2.04|
|         6|        Amit|        IT| 55000| 2021-12-15|             2.72|
+----------+------------+----------+------+-----------+-----------------+



In [15]:
# Next evaluation date: the first anniversary of joining.
# add_months(..., 12) lands on the same calendar day one year later (clamped to
# the end of the month when that day does not exist), whereas adding a fixed
# 365 days falls one day short whenever the span contains Feb 29.
eval_date_df = date_converted_df.withColumn(
    "NextEvaluationDate",
    F.add_months(F.col("JoiningDate"), 12),
)
eval_date_df.show()


+----------+------------+----------+------+-----------+------------------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|NextEvaluationDate|
+----------+------------+----------+------+-----------+------------------+
|         1|       Arjun|        IT| 75000| 2022-01-15|        2023-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|        2023-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|        2022-06-30|
|         4|       Sneha|        HR| 50000| 2022-05-01|        2023-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|        2023-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|        2022-12-15|
+----------+------------+----------+------+-----------+------------------+



In [16]:
# Mean salary for each department, one output row per department
avg_salary_df = (
    union_all_df
    .groupBy("Department")
    .agg(F.avg("Salary").alias("AverageSalary"))
)
avg_salary_df.show()


+----------+-----------------+
|Department|    AverageSalary|
+----------+-----------------+
|        IT|73333.33333333333|
|   Finance|          72500.0|
|        HR|          50000.0|
+----------+-----------------+



In [17]:
# Head-count over the whole DataFrame: an empty groupBy() treats all rows as one
# global group, and count() tallies the rows whose EmployeeID is not null
total_employees_df = (
    union_all_df
    .groupBy()
    .agg(F.count("EmployeeID").alias("TotalEmployees"))
)
total_employees_df.show()


+--------------+
|TotalEmployees|
+--------------+
|             6|
+--------------+



In [19]:
# Add an upper-cased copy of EmployeeName as a new column.
# The column is referenced with its exact casing so the lookup does not depend
# on Spark's default case-insensitive name resolution (spark.sql.caseSensitive=false).
Upper_name_df = union_all_df.withColumn(
    "EmployeeNameUpper",
    F.upper(F.col("EmployeeName")),
)
Upper_name_df.show()

+----------+------------+----------+------+-----------+-----------------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|EmployeeNameUpper|
+----------+------------+----------+------+-----------+-----------------+
|         1|       Arjun|        IT| 75000| 2022-01-15|            ARJUN|
|         2|       Vijay|   Finance| 85000| 2022-03-12|            VIJAY|
|         3|     Shalini|        IT| 90000| 2021-06-30|          SHALINI|
|         4|       Sneha|        HR| 50000| 2022-05-01|            SNEHA|
|         5|       Rahul|   Finance| 60000| 2022-08-20|            RAHUL|
|         6|        Amit|        IT| 55000| 2021-12-15|             AMIT|
+----------+------------+----------+------+-----------+-----------------+

