In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, hour, dayofweek

# NYC yellow-taxi preprocessing: load the raw CSV, de-duplicate, drop rows with
# missing or unparseable critical fields, derive pickup-time features, and
# filter out trips with a non-positive distance or passenger count.

# Initialize SparkSession (reuses an active session if one already exists)
spark = SparkSession.builder.appName("NYC Taxi Preprocessing").getOrCreate()

# Load CSV data (adjust path if needed); the first row is the header and
# column types are inferred from the data.
df = spark.read.csv("../data/yellow_tripdata_2023-01.csv", header=True, inferSchema=True)

# Remove exact duplicate rows (no subset given, so all columns are compared)
df = df.dropDuplicates()

# Drop rows with missing critical values
df = df.na.drop(subset=["tpep_pickup_datetime", "trip_distance", "passenger_count"])

# Convert pickup datetime to timestamp type
df = df.withColumn("pickup_datetime", to_timestamp(col("tpep_pickup_datetime")))

# With ANSI mode off, to_timestamp returns null for values it cannot parse, so
# a non-null raw value can still become a null timestamp. Drop those rows here;
# otherwise they slip past the null check above and carry null
# pickup_hour / pickup_dayofweek downstream.
df = df.na.drop(subset=["pickup_datetime"])

# Extract hour and day of week
# (Spark's dayofweek is 1-based starting on Sunday: 1 = Sunday ... 7 = Saturday)
df = df.withColumn("pickup_hour", hour(col("pickup_datetime")))
df = df.withColumn("pickup_dayofweek", dayofweek(col("pickup_datetime")))

# Filter out invalid trips: keep only positive distance and passenger count
df = df.filter((col("trip_distance") > 0) & (col("passenger_count") > 0))

# Show sample data
df.select("pickup_datetime", "pickup_hour", "pickup_dayofweek", "trip_distance", "passenger_count").show(5)