In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, FloatType

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# 🧍 Drivers lookup table: one row per driver, both columns nullable.
drivers_schema = (
    StructType()
    .add("driver_id", IntegerType(), True)
    .add("driver_name", StringType(), True)
)

# Driver ids are consecutive and start at 1, so pair each name with its
# 1-based position to get the (driver_id, driver_name) tuples.
drivers_data = list(
    enumerate(
        [
            "Alice Johnson",
            "Bob Smith",
            "Carol Davis",
            "David Wilson",
            "Emma Brown",
        ],
        start=1,
    )
)

drivers = spark.createDataFrame(drivers_data, schema=drivers_schema)

# 🚗 Trips fact table: one row per trip, linked to a driver through driver_id.
# Every column is nullable. trip_date is stored as a "yyyy-MM-dd" string;
# cast it to DateType downstream if real date semantics are needed.
trips_schema = (
    StructType()
    .add("trip_id", IntegerType(), True)
    .add("driver_id", IntegerType(), True)
    .add("trip_date", StringType(), True)
    .add("distance_km", FloatType(), True)
    .add("fuel_consumed", FloatType(), True)
)

# Row layout: (trip_id, driver_id, trip_date, distance_km, fuel_consumed)
trips_data = [
    # driver 1
    (1, 1, "2023-02-15", 120.5, 10.2),
    (2, 1, "2023-03-20", 200.0, 16.5),
    (3, 1, "2023-08-10", 150.0, 11.0),
    (4, 1, "2023-09-25", 180.0, 12.5),
    # driver 2
    (5, 2, "2023-01-10", 100.0, 9.0),
    (6, 2, "2023-04-15", 250.0, 22.0),
    (7, 2, "2023-10-05", 200.0, 15.0),
    # driver 3
    (8, 3, "2023-03-12", 80.0, 8.5),
    (9, 3, "2023-05-18", 90.0, 9.2),
    # driver 4
    (10, 4, "2023-07-22", 160.0, 12.8),
    (11, 4, "2023-11-30", 140.0, 11.0),
    # driver 5
    (12, 5, "2023-02-28", 110.0, 11.5),
]

trips = spark.createDataFrame(trips_data, schema=trips_schema)


In [0]:
# Sanity-check the inputs: print the drivers table first, then the trips table.
for input_frame in (drivers, trips):
    input_frame.show()

In [0]:
from pyspark.sql.functions import *

In [0]:
def _half_year_efficiency(trips_df, first_month, last_month, alias):
    """Return each driver's mean per-trip fuel efficiency for a range of months.

    Per-trip efficiency is ``distance_km / fuel_consumed``. Only trips whose
    ``trip_date`` month lies in ``[first_month, last_month]`` (inclusive) are
    considered; the year is not inspected.

    Args:
        trips_df: DataFrame with ``driver_id``, ``trip_date``, ``distance_km``
            and ``fuel_consumed`` columns.
        first_month: First calendar month (1-12) of the period, inclusive.
        last_month: Last calendar month (1-12) of the period, inclusive.
        alias: Name of the output column holding the average.

    Returns:
        DataFrame with one row per driver that has at least one trip in the
        period, with columns ``driver_id`` and ``alias``.
    """
    trip_month = F.month(F.col("trip_date"))
    per_trip_efficiency = F.col("distance_km") / F.col("fuel_consumed")
    return (
        trips_df
        .filter(trip_month.between(first_month, last_month))
        .groupBy("driver_id")
        # avg() skips NULL ratios in both numerator and denominator. The
        # previous sum(...)/count('*') still counted such rows, so a trip
        # with a NULL distance or fuel value dragged the average down.
        .agg(F.avg(per_trip_efficiency).alias(alias))
    )


# Average efficiency per driver for January-June and for July-December.
first_half = _half_year_efficiency(trips, 1, 6, "first_half_avg")
second_half = _half_year_efficiency(trips, 7, 12, "second_half_avg")

# Keep drivers that have trips in both halves (inner join) and whose
# second-half efficiency is strictly higher, then report the gain.
(
    first_half.join(second_half, on="driver_id", how="inner")
    .filter(F.col("first_half_avg") < F.col("second_half_avg"))
    .withColumn(
        "efficiency_improvement",
        F.col("second_half_avg") - F.col("first_half_avg"),
    )
    .join(drivers, on="driver_id", how="inner")
    .select(
        "driver_id",
        "driver_name",
        F.round("first_half_avg", 2).alias("first_half_avg"),
        F.round("second_half_avg", 2).alias("second_half_avg"),
        F.round("efficiency_improvement", 2).alias("efficiency_improvement"),
    )
    # Sorting happens after the select, so it uses the rounded improvement;
    # ties are broken alphabetically by driver name.
    .orderBy(F.desc("efficiency_improvement"), "driver_name")
    .show()
)