In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, datediff

In [2]:
spark = SparkSession.builder \
    .appName("Filter and Transform DataFrame") \
    .getOrCreate()



In [3]:
data = [
    (1, "John Doe", "Sales", "2020-05-01"),
    (2, "Jane Smith", "Marketing", "2019-07-15"),
    (3, "Emily Davis", "Sales", "2018-10-25"),
    (4, "Michael Brown", "HR", "2021-01-10")
]

In [4]:
columns = ["id", "name", "department", "hire_date"]

In [5]:
df = spark.createDataFrame(data, columns)

In [6]:
print("Initial DataFrame:")
df.show()

Initial DataFrame:
+---+-------------+----------+----------+
| id|         name|department| hire_date|
+---+-------------+----------+----------+
|  1|     John Doe|     Sales|2020-05-01|
|  2|   Jane Smith| Marketing|2019-07-15|
|  3|  Emily Davis|     Sales|2018-10-25|
|  4|Michael Brown|        HR|2021-01-10|
+---+-------------+----------+----------+



In [7]:
filtered_df = df.filter(col("department") == "Sales")

In [8]:
filtered_df = filtered_df.withColumn("hire_date", col("hire_date").cast("date"))

In [9]:
filtered_df = filtered_df.withColumn("years_of_service", 
                                     datediff(current_date(), col("hire_date")) / 3)

In [10]:
print("Transformed DataFrame:")
filtered_df.show()

Transformed DataFrame:
+---+-----------+----------+----------+-----------------+
| id|       name|department| hire_date| years_of_service|
+---+-----------+----------+----------+-----------------+
|  1|   John Doe|     Sales|2020-05-01|520.6666666666666|
|  3|Emily Davis|     Sales|2018-10-25|705.3333333333334|
+---+-----------+----------+----------+-----------------+



In [11]:
row_count = df.count()
print(f"Number of rows in DataFrame: {row_count}")

# 2. Show the contents of the DataFrame
print("Contents of the DataFrame:")
df.show()

Number of rows in DataFrame: 4
Contents of the DataFrame:
+---+-------------+----------+----------+
| id|         name|department| hire_date|
+---+-------------+----------+----------+
|  1|     John Doe|     Sales|2020-05-01|
|  2|   Jane Smith| Marketing|2019-07-15|
|  3|  Emily Davis|     Sales|2018-10-25|
|  4|Michael Brown|        HR|2021-01-10|
+---+-------------+----------+----------+



In [12]:
spark.stop()