<a href="https://colab.research.google.com/github/vamshap/PySpark-Challenges/blob/main/PercentageCancelled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as F
# Write a program to find the cancellation percentage; it should exclude trips involving banned users (clients) or banned drivers.
# Obtain the active Spark session, or start one under this application name
spark = (
    SparkSession.builder
    .appName("CreateAndPopulateTables")
    .getOrCreate()
)

# Trips table schema: every column is nullable; request_at is kept as a
# plain string rather than a date type.
trips_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("id", IntegerType),
        ("client_id", IntegerType),
        ("driver_id", IntegerType),
        ("city_id", IntegerType),
        ("status", StringType),
        ("request_at", StringType),
    )
])

# Users table schema: three nullable columns, built field by field
users_schema = (
    StructType()
    .add("users_id", IntegerType(), True)
    .add("banned", StringType(), True)
    .add("role", StringType(), True)
)

# Sample rows for the Trips table.
# Tuple layout follows trips_schema:
#   (id, client_id, driver_id, city_id, status, request_at)
# status is one of 'completed', 'cancelled_by_driver', 'cancelled_by_client';
# request_at is a 'YYYY-MM-DD' string.
trips_data = [
    (1, 1, 10, 1, 'completed', '2013-10-01'),
    (2, 2, 11, 1, 'cancelled_by_driver', '2013-10-01'),
    (3, 3, 12, 6, 'completed', '2013-10-01'),
    (4, 4, 13, 6, 'cancelled_by_client', '2013-10-01'),
    (5, 1, 10, 1, 'completed', '2013-10-02'),
    (6, 2, 11, 6, 'completed', '2013-10-02'),
    (7, 3, 12, 6, 'completed', '2013-10-02'),
    (8, 2, 12, 12, 'completed', '2013-10-03'),
    (9, 3, 10, 12, 'completed', '2013-10-03'),
    (10, 4, 13, 12, 'cancelled_by_driver', '2013-10-03')
]

# Sample rows for the Users table.
# Tuple layout follows users_schema: (users_id, banned, role)
# banned is a 'Yes'/'No' string; role is 'client' or 'driver'.
# In this sample only client 2 is banned; no driver is banned.
users_data = [
    (1, 'No', 'client'),
    (2, 'Yes', 'client'),
    (3, 'No', 'client'),
    (4, 'No', 'client'),
    (10, 'No', 'driver'),
    (11, 'No', 'driver'),
    (12, 'No', 'driver'),
    (13, 'No', 'driver')
]

# Materialize both tables as DataFrames with their explicit schemas
trips_df = spark.createDataFrame(trips_data, schema=trips_schema)
users_df = spark.createDataFrame(users_data, schema=users_schema)

# Print each table under its heading
for heading, frame in (("Trips Table:", trips_df), ("Users Table:", users_df)):
    print(heading)
    frame.show()



# Join Trips to Users twice: alias "U" resolves the trip's client and alias
# "D" resolves the trip's driver. A trip is kept only when NEITHER party is
# banned.
#
# Inner joins are used because the ban filter compares against columns of the
# joined user rows: a trip without a matching user row would yield
# NULL != "Yes", which is NULL and is rejected by the filter anyway, so an
# outer join would add nothing.
trips_with_users = (
    trips_df.alias("T")
    .join(users_df.alias("U"), F.col("T.client_id") == F.col("U.users_id"), "inner")
    .join(users_df.alias("D"), F.col("T.driver_id") == F.col("D.users_id"), "inner")
    # Exclude trips with a banned client OR a banned driver
    .filter((F.col("U.banned") != "Yes") & (F.col("D.banned") != "Yes"))
)

# Aggregate the filtered trips (trips_with_users) into a single summary row:
#   TotalCompletedRequests - trips whose status is exactly "completed"
#   TotalRequests          - all remaining trips
#   CancellationPercentage - share of trips that were NOT completed, in percent
result = trips_with_users.agg(
    # F.when without otherwise yields NULL for non-matching rows, and
    # F.count skips NULLs, so this counts only completed trips.
    F.count(F.when(F.col("T.status") == "completed", 1)).alias("TotalCompletedRequests"),
    F.count("*").alias("TotalRequests")
).withColumn(
    "CancellationPercentage",
    # Computed directly as (total - completed) / total * 100. Dividing the
    # two counts produces a double, so no explicit casts are needed, and the
    # result is a plain 0.0 (never -0.0) when nothing was cancelled.
    # The guard leaves the percentage NULL when there are no trips at all,
    # instead of dividing by zero.
    F.when(
        F.col("TotalRequests") > 0,
        (F.col("TotalRequests") - F.col("TotalCompletedRequests"))
        / F.col("TotalRequests") * 100
    )
)

# Show the result
result.show()



Trips Table:
+---+---------+---------+-------+-------------------+----------+
| id|client_id|driver_id|city_id|             status|request_at|
+---+---------+---------+-------+-------------------+----------+
|  1|        1|       10|      1|          completed|2013-10-01|
|  2|        2|       11|      1|cancelled_by_driver|2013-10-01|
|  3|        3|       12|      6|          completed|2013-10-01|
|  4|        4|       13|      6|cancelled_by_client|2013-10-01|
|  5|        1|       10|      1|          completed|2013-10-02|
|  6|        2|       11|      6|          completed|2013-10-02|
|  7|        3|       12|      6|          completed|2013-10-02|
|  8|        2|       12|     12|          completed|2013-10-03|
|  9|        3|       10|     12|          completed|2013-10-03|
| 10|        4|       13|     12|cancelled_by_driver|2013-10-03|
+---+---------+---------+-------+-------------------+----------+

Users Table:
+--------+------+------+
|users_id|banned|  role|
+--------+---