In [1]:
import pyspark
print(pyspark.__version__)

3.5.3


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Query 0 - Data Cleansing") \
    .getOrCreate()

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
# Create schema for csv data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True)
])

In [4]:
# Load the CSV data
df = spark.read.schema(schema).csv("input/sorted_data.csv")
df.printSchema()
df.show(5)

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- surcharge: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)

+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|           medal

In [5]:
initial_count = df.count()
print(f"Initial row count: {initial_count}")

Initial row count: 173185091


In [5]:
# Clean the data, drop null values that might have been added by giving the data a schema.
df_clean = df.dropna()
# Filter out wrong values
df_clean = df_clean.filter(df_clean.trip_time_in_secs > 0)
df_clean = df_clean.filter(df_clean.trip_distance > 0)
df_clean = df_clean.filter(df_clean.hack_license != "unknown")
df_clean = df_clean.filter(df_clean.medallion != "unknown")

df_clean = df_clean.filter(df_clean.pickup_latitude != 0.0)
df_clean = df_clean.filter(df_clean.pickup_longitude != 0.0)
df_clean = df_clean.filter(df_clean.dropoff_latitude != 0.0)
df_clean = df_clean.filter(df_clean.dropoff_longitude != 0.0)

In [6]:
from pyspark.sql.functions import to_date
#Create new column for partitioning
df_clean = df_clean.withColumn("trip_date", to_date(df_clean.pickup_datetime))

In [8]:
# Count total rows after cleaning
final_count = df_clean.count()
print(f"Row count after cleaning: {final_count}")


Row count after cleaning: 169346433


In [15]:
# Count total rows after cleaning
from pyspark.sql.functions import max

df_clean.head()
df_clean.select(max("pickup_datetime")).show()

+--------------------+
|max(pickup_datetime)|
+--------------------+
| 2013-12-31 23:59:57|
+--------------------+



In [16]:
# Count total rows after cleaning
from pyspark.sql.functions import max

df.head()
df.select(max("pickup_datetime")).show()

+--------------------+
|max(pickup_datetime)|
+--------------------+
| 2013-12-31 23:59:57|
+--------------------+



In [7]:
# Take only a sample of the cleaned data due to memory issues
df_sample = df_clean.sample(fraction=0.1, seed=42)
# Create daily partitions to output folder
df_sample.write.partitionBy("trip_date").mode("overwrite").parquet("output")

Kafka osa on pooleli, hetkel üritasin kasutada partitioneid mis lõin outputi, et need üks haaval kafkasse sisse lugeda.

In [None]:
import os
partition_dirs = [
    os.path.join("output", d)
    for d in os.listdir("output")
    if d.startswith("trip_date=")
]
for partition_path in partition_dirs:
    print(f"Writing data from: {partition_path}")
    df_partition = spark.read.parquet(partition_path)
    df_partition.show(5)

In [None]:
# Send the data to Kafka for future use.
df_sample.selectExpr("CAST(medallion AS STRING) AS key", "to_json(struct(*)) AS value") \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("topic", "cleaned_data") \
    .save()

Query 1

In [31]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
    current_timestamp, make_timestamp, month, dayofmonth, hour, minute, second, lit,
    expr, col, floor, concat_ws, count
)

def get_adjusted_window() -> tuple:
    now = current_timestamp()
    adjusted_now = make_timestamp(
        lit(2013),
        month(now),
        dayofmonth(now),
        hour(now),
        minute(now),
        second(now)
    )
    window_start = expr("""
        make_timestamp(2013,
                       month(current_timestamp()),
                       day(current_timestamp()),
                       hour(current_timestamp()),
                       minute(current_timestamp()),
                       second(current_timestamp()))
        - interval 30 minutes
    """)
    return adjusted_now, window_start


def get_window_dates(adjusted_now, window_start) -> tuple:
    adjusted_date_str = adjusted_now.cast("date")
    window_date_str = window_start.cast("date")
    return adjusted_date_str, window_date_str


def load_filtered_parquet(base_path: str, adjusted_date, window_date) -> DataFrame:
    return spark.read.option("basePath", base_path).parquet(base_path) \
        .filter((col("trip_date") == adjusted_date) | (col("trip_date") == window_date))


def add_grid_cells(df: DataFrame, grid_size: float = 0.01) -> DataFrame:
    return df \
        .withColumn("start_cell_lat", floor(col("pickup_latitude") / grid_size)) \
        .withColumn("start_cell_lon", floor(col("pickup_longitude") / grid_size)) \
        .withColumn("start_cell", concat_ws("_", col("start_cell_lat"), col("start_cell_lon"))) \
        .withColumn("end_cell_lat", floor(col("dropoff_latitude") / grid_size)) \
        .withColumn("end_cell_lon", floor(col("dropoff_longitude") / grid_size)) \
        .withColumn("end_cell", concat_ws("_", col("end_cell_lat"), col("end_cell_lon")))


def get_top_routes(df: DataFrame, adjusted_now, window_start, top_n: int = 10) -> DataFrame:
    filtered = df.filter(
        (col("dropoff_datetime") > window_start) &
        (col("dropoff_datetime") <= adjusted_now)
    )
    return filtered.groupBy("start_cell", "end_cell") \
        .agg(count("*").alias("Number of Rides")) \
        .orderBy(col("Number of Rides").desc()) \
        .limit(top_n)

In [33]:
adjusted_now, window_start = get_adjusted_window()
adjusted_date, window_date = get_window_dates(adjusted_now, window_start)

df = load_filtered_parquet("output/", adjusted_date, window_date)
df_with_cells = add_grid_cells(df)

top_routes = get_top_routes(df_with_cells, adjusted_now, window_start)
top_routes.show()

+----------+----------+---------------+
|start_cell|  end_cell|Number of Rides|
+----------+----------+---------------+
|4075_-7398|4074_-7399|             10|
|4075_-7399|4075_-7398|             10|
|4075_-7398|4075_-7399|              9|
|4074_-7399|4075_-7399|              8|
|4074_-7399|4075_-7398|              8|
|4076_-7398|4075_-7398|              8|
|4077_-7397|4076_-7397|              7|
|4074_-7400|4075_-7399|              7|
|4077_-7396|4077_-7395|              6|
|4075_-7400|4075_-7398|              6|
+----------+----------+---------------+

