# Uber-Data-Analysis-Project-in-Pyspark

In [53]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import sum, max, window, avg, hour, sum, dayofweek

In [2]:
# create a sparksession
spark = SparkSession.builder.appName("UberDataAnalysis").getOrCreate()

# Load the dataset into a dataframe
df = spark.read.csv('./data/uber.csv', header = True, inferSchema = True)

In [3]:
df.show(5)

+---------+------------+---------+-------+----------------+---------+--------------+
|     Date|Time (Local)|Eyeballs |Zeroes |Completed Trips |Requests |Unique Drivers|
+---------+------------+---------+-------+----------------+---------+--------------+
|10-Sep-12|           7|        5|      0|               2|        2|             9|
|     NULL|           8|        6|      0|               2|        2|            14|
|     NULL|           9|        8|      3|               0|        0|            14|
|     NULL|          10|        9|      2|               0|        1|            14|
|     NULL|          11|       11|      1|               4|        4|            11|
+---------+------------+---------+-------+----------------+---------+--------------+
only showing top 5 rows



In [4]:
print(df)

DataFrame[Date: string, Time (Local): int, Eyeballs : int, Zeroes : int, Completed Trips : int, Requests : int, Unique Drivers: int]


In [5]:
# Group the data and sum the completed trips
complete_trips_by_date = df.groupBy("Date").sum("Completed Trips ")
complete_trips_by_date.show()

+---------+---------------------+
|     Date|sum(Completed Trips )|
+---------+---------------------+
|11-Sep-12|                    1|
|13-Sep-12|                    0|
|17-Sep-12|                    0|
|18-Sep-12|                    3|
|21-Sep-12|                   10|
|     NULL|                 1261|
|15-Sep-12|                   23|
|12-Sep-12|                    0|
|19-Sep-12|                    0|
|14-Sep-12|                    3|
|16-Sep-12|                   17|
|23-Sep-12|                   23|
|24-Sep-12|                    1|
|22-Sep-12|                   19|
|20-Sep-12|                    2|
|10-Sep-12|                    2|
+---------+---------------------+



In [6]:
# Find the date with the most completed trips
date_with_completed_trips = \
    complete_trips_by_date.orderBy("sum(Completed Trips )", ascending = True).select("Date").first()['Date']

print(date_with_completed_trips)

13-Sep-12


In [7]:
# group the data by 24 hour windows and sum the completed trips
df = df.withColumn("Time (Local)", from_unixtime(col("Time (Local)")))

complete_trips_by_windows = df.groupBy(window("Time (Local)", "24 hours")).agg(
    sum("Completed Trips ").alias("Total Completed Trips")).orderBy("Total Completed Trips", ascending = False)

complete_trips_by_windows.show()

+--------------------+---------------------+
|              window|Total Completed Trips|
+--------------------+---------------------+
|{1970-01-01 08:00...|                 1365|
+--------------------+---------------------+



In [8]:
# Get the highest number of completed trips within a 24-hour period
highest_completed_trips_in_24_hours = complete_trips_by_windows.select("Total Completed Trips").first()[
    'Total Completed Trips']

print(highest_completed_trips_in_24_hours)

1365


In [14]:
hourly_requests = df.groupBy(hour("Time (Local)").alias("hour")).agg(sum("Requests ").alias("total_requests")).orderBy(
    "total_requests", ascending = False)

hourly_requests.show()

+----+--------------+
|hour|total_requests|
+----+--------------+
|   8|          1858|
+----+--------------+



In [15]:
most_requested_hour = hourly_requests.select("hour").first()[0]

print("The hour with the most requests is: ", most_requested_hour)

The hour with the most requests is:  8


In [17]:
print(df)

DataFrame[Date: string, Time (Local): string, Eyeballs : int, Zeroes : int, Completed Trips : int, Requests : int, Unique Drivers: int]


In [31]:
df.show(5)

+---------+-------------------+---------+-------+----------------+---------+--------------+
|     Date|       Time (Local)|Eyeballs |Zeroes |Completed Trips |Requests |Unique Drivers|
+---------+-------------------+---------+-------+----------------+---------+--------------+
|10-Sep-12|1970-01-01 08:00:07|        5|      0|               2|        2|             9|
|     NULL|1970-01-01 08:00:08|        6|      0|               2|        2|            14|
|     NULL|1970-01-01 08:00:09|        8|      3|               0|        0|            14|
|     NULL|1970-01-01 08:00:10|        9|      2|               0|        1|            14|
|     NULL|1970-01-01 08:00:11|       11|      1|               4|        4|            11|
+---------+-------------------+---------+-------+----------------+---------+--------------+
only showing top 5 rows



In [50]:
weekend_zeros = weekend_zeros = df.filter((hour("Time (Local)") >= 17) | (hour("Time (Local)") < 3)).filter(
    (dayofweek("Date") == 6) | (dayofweek("Date") == 7)).agg(sum("Zeroes ").alias("weekend_zeros")).collect()[0][
    "weekend_zeros"]

print(weekend_zeros)

None


In [29]:
# Calculate total zeros
total_zeros = df.agg(sum("Zeroes ").alias("total_zeros")).collect()[0]["total_zeros"]

print(total_zeros)

1429


In [52]:
weekend_zeros = 0
percent_weekend_zeros = weekend_zeros / total_zeros * 100

print("The percentage of zeros that occurred on weekends is: ", percent_weekend_zeros)

The percentage of zeros that occurred on weekends is:  0.0


In [54]:
weighted_avg = df.withColumn("completed_per_driver", df["Completed Trips"] / df["Unique Drivers"]) \
    .groupBy("Date", "Time (Local)") \
    .agg(avg("completed_per_driver").alias("avg_completed_per_driver"),
         sum("Completed Trips").alias("total_completed_trips")) \
    .withColumn("weighted_ratio", col("avg_completed_per_driver") * col("total_completed_trips")) \
    .agg(sum("weighted_ratio") / sum("total_completed_trips")).collect()[0][0]

print("The weighted average ratio of completed trips per driver is:", weighted_avg)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Completed Trips` cannot be resolved. Did you mean one of the following? [`Date`, `Time (Local)`, `Eyeballs `, `Zeroes `, `Completed Trips `, `Requests `, `Unique Drivers`].