# Project 2

### Imports

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import max, col, unix_timestamp, lit


### Query 0 - Data Cleansing and Setup

In [1]:
# Build the Spark session used by every query in this notebook
# (getOrCreate hands back the already-active session if one exists).
spark = (
    SparkSession.builder
    .appName("Project")
    .getOrCreate()
)

In [8]:
# Load the raw trip records. The file carries no header row, so Spark assigns
# placeholder column names and infers each column's type from the data.
df_small = (
    spark.read
    .option("header", False)
    .option("inferSchema", True)
    .csv("input/sorted_data_smaller.csv")
)

In [9]:
# Column names for the header-less CSV, in file order (17 fields).
columns = [
    # vehicle / driver identifiers
    "medallion", "hack_license",
    # trip timing
    "pickup_datetime", "dropoff_datetime", "trip_time_in_secs",
    # trip geometry
    "trip_distance",
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude",
    # payment breakdown
    "payment_type", "fare_amount", "surcharge", "mta_tax",
    "tip_amount", "tolls_amount", "total_amount",
]

# Swap the placeholder names for the descriptive ones above; the positional
# rename requires the list to match the DataFrame's column order.
df_small = df_small.toDF(*columns)

In [None]:
# Sanity check after the rename: preview five rows, then print the
# column names together with their types.
df_small.show(n=5)
df_small.printSchema()

In [15]:

# Data cleansing pipeline: each step narrows df_small further, and the final
# two steps add numeric timestamp columns.
df_clean = (
    df_small
    # Coordinates: drop rows where any pickup/dropoff coordinate is missing or 0.0.
    .filter(col("pickup_longitude").isNotNull() & (col("pickup_longitude") != 0.0))
    .filter(col("pickup_latitude").isNotNull() & (col("pickup_latitude") != 0.0))
    .filter(col("dropoff_longitude").isNotNull() & (col("dropoff_longitude") != 0.0))
    .filter(col("dropoff_latitude").isNotNull() & (col("dropoff_latitude") != 0.0))
    # Identifiers: drop rows whose medallion or hack license is missing or empty.
    .filter(col("medallion").isNotNull() & (col("medallion") != ""))
    .filter(col("hack_license").isNotNull() & (col("hack_license") != ""))
    # Time model: Unix-timestamp versions of the pickup/dropoff datetimes,
    # used by the time-based queries.
    .withColumn("pickup_ts", unix_timestamp("pickup_datetime"))
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime"))
)

In [21]:
# Report the row totals before and after cleansing; each count() is a
# separate Spark action over the corresponding DataFrame.
print(f"Original row count: {df_small.count()}")
print(f"Cleaned row count: {df_clean.count()}")

Original row count: 14432092
Cleaned row count: 14186504


### Query 1: Frequent Routes