# Uber-Data-Analysis-Project-in-Pyspark

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import max
from pyspark.sql.functions import sum, window

In [3]:
# create a sparksession
spark = SparkSession.builder.appName("UberDataAnalysis").getOrCreate()

# Load the dataset into a dataframe
df = spark.read.csv('./data/uber.csv', header = True, inferSchema = True)

In [4]:
df.show(5)

+---------+------------+---------+-------+----------------+---------+--------------+
|     Date|Time (Local)|Eyeballs |Zeroes |Completed Trips |Requests |Unique Drivers|
+---------+------------+---------+-------+----------------+---------+--------------+
|10-Sep-12|           7|        5|      0|               2|        2|             9|
|     NULL|           8|        6|      0|               2|        2|            14|
|     NULL|           9|        8|      3|               0|        0|            14|
|     NULL|          10|        9|      2|               0|        1|            14|
|     NULL|          11|       11|      1|               4|        4|            11|
+---------+------------+---------+-------+----------------+---------+--------------+
only showing top 5 rows



In [26]:
print(df)

DataFrame[Date: string, Time (Local): int, Eyeballs : int, Zeroes : int, Completed Trips : int, Requests : int, Unique Drivers: int]


In [8]:
# Group the data and sum the completed trips
complete_trips_by_date = df.groupBy("Date").sum("Completed Trips ")
complete_trips_by_date.show()

+---------+---------------------+
|     Date|sum(Completed Trips )|
+---------+---------------------+
|11-Sep-12|                    1|
|13-Sep-12|                    0|
|17-Sep-12|                    0|
|18-Sep-12|                    3|
|21-Sep-12|                   10|
|     NULL|                 1261|
|15-Sep-12|                   23|
|12-Sep-12|                    0|
|19-Sep-12|                    0|
|14-Sep-12|                    3|
|16-Sep-12|                   17|
|23-Sep-12|                   23|
|24-Sep-12|                    1|
|22-Sep-12|                   19|
|20-Sep-12|                    2|
|10-Sep-12|                    2|
+---------+---------------------+



In [22]:
# Find the date with the most completed trips
date_with_completed_trips = \
    complete_trips_by_date.orderBy("sum(Completed Trips )", ascending = True).select("Date").first()['Date']

print(date_with_completed_trips)

13-Sep-12


In [28]:
# group the data by 24 hour windows and sum the completed trips
df = df.withColumn("Time (Local)", from_unixtime(col("Time (Local)")))

complete_trips_by_windows = df.groupBy(window("Time (Local)", "24 hours")).agg(
    sum("Completed Trips ").alias("Total Completed Trips")).orderBy("Total Completed Trips", ascending = False)

complete_trips_by_windows.show()

+--------------------+---------------------+
|              window|Total Completed Trips|
+--------------------+---------------------+
|{1970-01-01 08:00...|                 1365|
+--------------------+---------------------+



In [30]:
# Get the highest number of completed trips within a 24-hour period
highest_completed_trips_in_24_hours = complete_trips_by_windows.select("Total Completed Trips").first()[
    'Total Completed Trips']

print(highest_completed_trips_in_24_hours)

1365
