In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) a Spark session that runs against a local master.
# NOTE(review): with a local[*] master the executor-memory setting probably has
# no effect (work runs inside the driver process) — confirm whether
# spark.driver.memory was the intended knob.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Local-PySpark-to-Docker-Spark")
    .config("spark.driver.host", "host.docker.internal")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [6]:
# Load the zone lookup table, treating the first CSV row as column names.
df = spark.read.csv("taxi_zone_lookup.csv", header=True)

In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain a local session named 'test'; getOrCreate() hands back an
# already-running session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName('test')
    .master("local[*]")
    .getOrCreate()
)

# Read the zone lookup CSV (first row holds the column names) and preview it.
zone_csv_reader = spark.read.format("csv").option("header", "true")
df = zone_csv_reader.load('taxi_zone_lookup.csv')

df.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [None]:
# Question 1. PySpark version
# Display the version string reported by the active Spark session.
spark.version

'3.3.2'

In [20]:
# Load the yellow-taxi trip records for October 2024 from the local parquet file.
df = spark.read.format("parquet").load("yellow_tripdata_2024-10.parquet")

In [None]:
# Question 2. Size of partition
# Redistribute the trips into 4 partitions and write them out as parquet so the
# size of the resulting part files can be inspected on disk.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
df.repartition(4).write.mode("overwrite").parquet("yellow_data/yellow_repartitioned")

In [7]:
from pyspark.sql import functions as F


In [None]:
# Question 3. Number of trips
# Add the pickup timestamp truncated to a date, then count the trips whose
# pickup date is 2024-10-15.
df = df.withColumn("pickup_date", F.to_date(F.col("tpep_pickup_datetime")))
df.where(F.col("pickup_date") == F.lit("2024-10-15")).count()

122561

In [None]:
# Question 4. Longest trip
# Trip duration in hours: difference of the dropoff and pickup epoch seconds
# divided by 3600. The first row after a descending sort is the longest trip.
df = df.withColumn(
    "trip_hours",
    (
        F.unix_timestamp("tpep_dropoff_datetime")
        - F.unix_timestamp("tpep_pickup_datetime")
    ) / 3600,
)
df.sort(F.col("trip_hours").desc()).first()

In [29]:
# Zone lookup table (first CSV row is the header) used to name pickup locations.
zones = spark.read.csv("taxi_zone_lookup.csv", header=True)

In [None]:
# Question 6. Least frequent pickup location zone
# Attach the zone lookup columns to each trip through its pickup location ID.
# The guard makes the cell safe to re-run: joining a second time would add a
# duplicate set of lookup columns and turn `Zone` into an ambiguous reference.
if "Zone" not in df.columns:
    df = df.join(zones, df.PULocationID == zones.LocationID, how="inner")
# Count trips per zone; the ascending sort lists the least frequent zones first.
df.groupBy("Zone").count().orderBy("count").show()