In [1]:
# Install necessary libraries
!pip install pyspark
!pip install geopandas
!pip install shapely
!pip install matplotlib
!pip install seaborn

from pyspark.sql import SparkSession
import geopandas as gpd
from shapely.geometry import shape, Point
from pyspark.sql.functions import udf, unix_timestamp, col, lag, sum as spark_sum, avg, trim
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Create the Spark session used by every DataFrame below, or reuse one that
# already exists in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .getOrCreate()
)

# Function to trim column names
def trim_columns(df):
    """Return `df` with leading/trailing whitespace stripped from every column name.

    Args:
        df: object exposing `columns` (iterable of names) and
            `withColumnRenamed(old, new)` returning a new frame.

    Returns:
        The frame obtained by renaming each original column to its stripped name,
        one rename at a time, in the original column order.
    """
    renamed = df
    # Iterate over the names of the *input* frame; each rename yields a new frame.
    for original_name in df.columns:
        stripped_name = original_name.strip()
        renamed = renamed.withColumnRenamed(original_name, stripped_name)
    return renamed

# Upload and read trip data
# NOTE(review): google.colab's files.upload() is expected to save the selected files into
# the current working directory, while the reads below glob 'trip_data/*.csv' and
# 'trip_fare/*.csv' — confirm those directories exist and contain the uploaded CSVs.
from google.colab import files
uploaded = files.upload()
trip_data_df = spark.read.csv('trip_data/*.csv', header=True, inferSchema=True)
trip_data_df = trim_columns(trip_data_df)

# Upload and read trip fare data
uploaded = files.upload()
trip_fare_df = spark.read.csv('trip_fare/*.csv', header=True, inferSchema=True)
trip_fare_df = trim_columns(trip_fare_df)

# Upload and load GeoJSON data
uploaded = files.upload()  # Make sure to select your nyc-boroughs.geojson file here
gdf = gpd.read_file('nyc-boroughs.geojson')
# buffer(0) is applied only to geometries that report themselves as invalid
gdf['geometry'] = gdf['geometry'].apply(lambda x: x if x.is_valid else x.buffer(0))

# Broadcast borough data
# Merge all features that share a boroughCode into one geometry first. Building the dict
# straight from the rows would keep only the LAST feature per code, so any borough that
# is stored as several features would lose all but one of its polygons.
boroughs = list(gdf.dissolve(by='boroughCode')['geometry'].items())  # (code, geometry) pairs
borough_dict = dict(boroughs)
boroughs_broadcast = spark.sparkContext.broadcast(borough_dict)

# Define UDF for getting borough from coordinates
def get_borough(longitude, latitude):
    """Return the code of the first broadcast borough geometry containing the point.

    Args:
        longitude: x coordinate of the point, or None when the value is missing.
        latitude: y coordinate of the point, or None when the value is missing.

    Returns:
        The key of the first entry in `boroughs_broadcast.value` whose geometry
        contains the point; None when either coordinate is None or when no
        geometry contains the point.
    """
    # Missing values reach a Python UDF as None. Building a Point from None raises,
    # and an exception inside a UDF fails the whole Spark job, so bail out early.
    if longitude is None or latitude is None:
        return None
    point = Point(longitude, latitude)
    for code, geom in boroughs_broadcast.value.items():
        if geom.contains(point):
            return code
    return None

# Wrap get_borough as a Spark UDF (no return type is passed, so udf() uses its default)
get_borough_udf = udf(get_borough)

# Enrich DataFrame with borough information
# Joining on medallion + hack_license alone pairs every trip of a cab/driver with every
# fare row of that cab/driver. Also match on vendor_id and pickup_datetime whenever both
# inputs carry them, so each trip is paired with its own fare record.
# NOTE(review): assumes the usual NYC taxi trip_data / trip_fare layout, where these four
# columns identify a trip in both files — confirm against the uploaded CSV headers.
trip_join_keys = ['medallion', 'hack_license'] + [
    key for key in ('vendor_id', 'pickup_datetime')
    if key in trip_data_df.columns and key in trip_fare_df.columns
]
df = trip_data_df.alias('td').join(trip_fare_df.alias('tf'), trip_join_keys)
df = df.withColumn("pickup_borough", get_borough_udf(df["td.pickup_longitude"], df["td.pickup_latitude"]))
df = df.withColumn("dropoff_borough", get_borough_udf(df["td.dropoff_longitude"], df["td.dropoff_latitude"]))

# Process timestamps and calculate durations
# unix_timestamp() yields seconds, so dividing the difference by 3600 gives hours
df = df.withColumn("pickup_ts", unix_timestamp(df["td.pickup_datetime"]))
df = df.withColumn("dropoff_ts", unix_timestamp(df["td.dropoff_datetime"]))
df = df.withColumn("duration", (col("dropoff_ts") - col("pickup_ts")) / 3600)
# Keep only trips with a positive duration of at most 4 hours
df = df.filter((col("duration") > 0) & (col("duration") <= 4))

# Calculate idle time using window functions
# For each driver (hack_license), trips are ordered by pickup time and lag() looks at the
# immediately preceding trip.
windowSpec = Window.partitionBy("hack_license").orderBy("pickup_ts")
df = df.withColumn("previous_dropoff_ts", lag("dropoff_ts").over(windowSpec))
# Borough where the previous trip ended, i.e. where the wait for this fare started.
# Taken before the filter below so it refers to the same trip as previous_dropoff_ts.
df = df.withColumn("previous_dropoff_borough", lag("dropoff_borough").over(windowSpec))
df = df.withColumn("idle_time", (col("pickup_ts") - col("previous_dropoff_ts")) / 3600)
# Keep idle times between 0 and 4 hours. A negative value means this trip starts before
# the previous one ended (overlapping records) and would distort both metrics below.
# Rows with a null idle_time (a driver's first trip) do not satisfy the comparison and
# are dropped as well.
df = df.filter((col("idle_time") >= 0) & (col("idle_time") <= 4))

# Compute utilization
# Share of tracked time spent on a trip: occupied / (occupied + idle). This stays within
# (0, 1]; the former 1 - idle/occupied went negative whenever idle exceeded occupied time.
utilization = df.groupBy("hack_license").agg(
    (spark_sum("duration") / (spark_sum("duration") + spark_sum("idle_time"))).alias("utilization")
)

# Compute average time to next fare per borough
# idle_time on a row is the wait BEFORE that row's pickup, so it belongs to the borough
# where the previous trip was dropped off, not to this trip's own dropoff borough.
avg_time_to_next_fare = (
    df.groupBy(col("previous_dropoff_borough").alias("dropoff_borough"))
    .agg(avg("idle_time").alias("avg_idle_time"))
)

# Show the average idle time per dropoff borough
avg_time_to_next_fare.show()

# Compute trip counts
# Both counts come from ONE aggregation. Two separate count() actions would each
# re-execute the whole un-cached lineage of df (join, Python UDFs, window).
# A comparison involving a null borough is null, casts to null and is ignored by sum(),
# so such rows land in neither bucket — the same outcome as filtering on the comparison.
borough_trip_counts = df.agg(
    spark_sum((col("pickup_borough") == col("dropoff_borough")).cast("int")).alias("same"),
    spark_sum((col("pickup_borough") != col("dropoff_borough")).cast("int")).alias("different"),
).first()
# sum() over zero non-null values is null; report 0 in that case, as count() would
same_borough_trips = borough_trip_counts["same"] or 0
different_borough_trips = borough_trip_counts["different"] or 0

# Print the number of trips within the same borough and across different boroughs
print(f"Number of trips within the same borough: {same_borough_trips}")
print(f"Number of trips across different boroughs: {different_borough_trips}")

# Pull the idle_time column to the driver as a pandas DataFrame for plotting
pandas_df = df.select("idle_time").toPandas()

# Turn +/-inf into NaN so that dropna() below removes them as well
pandas_df = pandas_df.replace([np.inf, -np.inf], np.nan)
idle_hours = pandas_df['idle_time'].dropna()

# Histogram with a KDE overlay, drawn on an explicitly created 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(idle_hours, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Idle Times')
ax.set_xlabel('Idle Time (hours)')
ax.set_ylabel('Frequency')
plt.show()

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 3.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=fc8583bbe085e992006cbbea651eba17740353599735380f7d86698c25209b0e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


KeyboardInterrupt: 