5.2 Yellow Taxi 

In [0]:
#Load the yellow taxi dataframe
df_y = spark.read.option("header", True).parquet("/mnt/dummy-1/yellow_taxi*")
display(df_y)

In [0]:
#Shape of yellow taxi dataframe
print((df_y.count(), len(df_y.columns)))

(663055251, 19)


In [0]:
df_y.printSchema() 

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [0]:
#Drop some unused columns
df_y_c = df_y.drop("VendorID", "RatecodeID", "store_and_fwd_flag", "payment_type")
display(df_y_c)

In [0]:
#Reduce the size of yellow taxi dataset due to it take more than 1 hour to run entire around 600 million rows.
#Sample the dataset by 20% of entire dataset.
df_y_s = df_y_c.sample(withReplacement=False, fraction=0.2, seed=42)
display(df_y_s)
df_y_s.cache().count()

In [0]:
#Use Spark SQL and create temp view for SQL
df_y_s.createOrReplaceTempView("taxi_y_trips")

In [0]:
# 1) The Trips finishing before the starting time
filter_1 = spark.sql("SELECT tpep_pickup_datetime, tpep_dropoff_datetime FROM taxi_y_trips WHERE tpep_pickup_datetime > tpep_dropoff_datetime")
#filter_1 = df_y.filter(df_y['tpep_pickup_datetime'] > df_y['tpep_dropoff_datetime'])
#display(filter_1)
filter_1.show()

+--------------------+---------------------+
|tpep_pickup_datetime|tpep_dropoff_datetime|
+--------------------+---------------------+
| 2015-01-02 13:17:09|  2015-01-02 13:15:23|
| 2015-01-04 01:51:53|  2015-01-04 01:50:28|
| 2015-01-04 18:33:39|  2015-01-04 18:33:10|
| 2015-01-05 13:33:38|  2015-01-05 13:33:19|
| 2015-01-06 12:36:07|  2015-01-06 12:34:37|
| 2015-01-06 15:34:42|  2015-01-06 15:33:54|
| 2015-01-06 15:30:24|  2015-01-06 15:30:00|
| 2015-01-07 10:19:06|  2015-01-07 10:18:42|
| 2015-01-07 11:45:44|  2015-01-07 11:45:18|
| 2015-01-07 18:35:05|  2015-01-07 18:33:53|
| 2015-01-07 21:42:21|  2015-01-07 21:41:04|
| 2015-01-08 15:10:34|  2015-01-08 15:09:29|
| 2015-01-08 16:12:59|  2015-01-08 16:12:13|
| 2015-01-08 19:45:31|  2015-01-08 19:45:00|
| 2015-01-09 18:26:12|  2015-01-09 18:25:57|
| 2015-01-09 18:31:01|  2015-01-09 18:30:42|
| 2015-01-10 15:29:26|  2015-01-10 15:28:05|
| 2015-01-10 22:54:19|  2015-01-10 22:51:49|
| 2015-01-11 14:53:24|  2015-01-11 14:52:55|
| 2015-01-

In [0]:
filter_1.count()

Out[10]: 16036

In [0]:
#Filter 1) out
df_y_s = spark.sql("SELECT * FROM taxi_y_trips WHERE tpep_pickup_datetime < tpep_dropoff_datetime")
display(df_y_s)

In [0]:
df_y_s.count()

Out[15]: 132455634

In [0]:
from pyspark.sql.functions import split, col

# Split the lpep_pickup_datetime column into date and time
df_y_s = df_y_s.withColumn("date_pickup", split(col("tpep_pickup_datetime"), "T").getItem(0))

df_y_s = df_y_s.withColumn("d_pickup", split(col("date_pickup"), " ").getItem(0))\
            .withColumn("t_pickup", split(col("date_pickup"), " ").getItem(1))
df_y_s = df_y_s.drop("date_pickup", "time_pickup")
display(df_y_s, truncate = False)

In [0]:
# Split the timestamp column into date and time
df_y_s = df_y_s.withColumn("date_dropoff", split(col("tpep_dropoff_datetime"), "T").getItem(0))

df_y_s = df_y_s.withColumn("d_dropoff", split(col("date_dropoff"), " ").getItem(0))\
            .withColumn("t_dropoff", split(col("date_dropoff"), " ").getItem(1))
df_y_s = df_y_s.drop("date_dropoff")
display(df_y_s, truncate = False)


In [0]:
# 2) Trips where the pickup/dropoff datetime is outside of the range
# Skip Filtering out-of-range time. It take a long time to run it and have a same result as a green taxi.
#Filter out-of-range date only
from pyspark.sql.functions import to_date

# Convert the string columns to date 
df_y_s = df_y_s.withColumn("d_pickup", to_date(df_y_s["d_pickup"], "yyyy-MM-dd")) \
               .withColumn("d_dropoff", to_date(df_y_s["d_dropoff"], "yyyy-MM-dd"))


filter_2 = df_y_s.filter(
    (col("d_pickup") < "2015-01-01") | 
    (col("d_dropoff") < "2015-01-01")
)

display(filter_2)


In [0]:
#Count Filter_2 row
filter_2.count()

Out[20]: 494

In [0]:
#Filter filter_2_2 out
df_y_s = df_y_s.filter((col("d_pickup") >= "2015-01-01") & (col("d_pickup") <= "2022-12-31"))
display(df_y_s)

In [0]:
#Save file #1
df_y_s.write.mode("overwrite").parquet("/FileStore/tables/df_yellow_cleaned.parquet")

In [0]:
#Load file #1
df_y_s_2 = spark.read.option("header", True).parquet("/FileStore/tables/df_yellow_cleaned.parquet")
display(df_y_s_2)

In [0]:
df_y_s_2.count()

Out[27]: 132455132

In [0]:
# 3) Trips with negative speed
from pyspark.sql.functions import unix_timestamp, round, col
# Calculate duration in seconds by subtracting pickup and dropoff with unix timestamp
df_y_s_2 = df_y_s_2.withColumn(
    "duration_seconds",
    unix_timestamp(col("tpep_dropoff_datetime")) - unix_timestamp(col("tpep_pickup_datetime"))
)
# Convert seconds to hours with 2 decimal points
df_y_s_2 = df_y_s_2.withColumn(
    "duration_hours",
    round(col("duration_seconds") / 3600, 2)
)


display(df_y_s_2.limit(10))

tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,d_pickup,t_pickup,d_dropoff,t_dropoff,duration_seconds,duration_hours
2021-11-23T19:11:52.000+0000,2021-11-23T19:23:06.000+0000,1,1.54,68,114,8.5,1.0,0.5,2.56,0,0.3,15.36,2.5,0,2021-11-23,19:11:52,2021-11-23,19:23:06,674,0.19
2021-11-23T19:41:58.000+0000,2021-11-23T19:48:47.000+0000,1,0.93,170,233,6.0,1.0,0.5,1.0,0,0.3,11.3,2.5,0,2021-11-23,19:41:58,2021-11-23,19:48:47,409,0.11
2021-11-23T19:45:35.000+0000,2021-11-23T19:58:12.000+0000,1,1.5,246,170,9.5,3.5,0.5,1.0,0,0.3,14.8,2.5,0,2021-11-23,19:45:35,2021-11-23,19:58:12,757,0.21
2021-11-23T19:33:08.000+0000,2021-11-23T19:41:16.000+0000,1,1.26,237,233,7.5,1.0,0.5,2.36,0,0.3,14.16,2.5,0,2021-11-23,19:33:08,2021-11-23,19:41:16,488,0.14
2021-11-23T19:31:15.000+0000,2021-11-23T19:36:15.000+0000,1,0.8,162,229,5.5,1.0,0.5,0.0,0,0.3,9.8,2.5,0,2021-11-23,19:31:15,2021-11-23,19:36:15,300,0.08
2021-11-23T19:24:13.000+0000,2021-11-23T19:29:17.000+0000,1,0.72,186,246,5.5,1.0,0.5,0.0,0,0.3,9.8,2.5,0,2021-11-23,19:24:13,2021-11-23,19:29:17,304,0.08
2021-11-23T19:42:24.000+0000,2021-11-23T20:11:21.000+0000,1,3.39,230,263,19.0,1.0,0.5,5.82,0,0.3,29.12,2.5,0,2021-11-23,19:42:24,2021-11-23,20:11:21,1737,0.48
2021-11-23T19:26:31.000+0000,2021-11-23T19:44:21.000+0000,2,2.4,170,142,12.5,3.5,0.5,1.68,0,0.3,18.48,2.5,0,2021-11-23,19:26:31,2021-11-23,19:44:21,1070,0.3
2021-11-23T19:16:06.000+0000,2021-11-23T19:53:08.000+0000,2,7.8,88,100,31.5,3.5,0.5,0.0,0,0.3,35.8,2.5,0,2021-11-23,19:16:06,2021-11-23,19:53:08,2222,0.62
2021-11-23T19:11:27.000+0000,2021-11-23T19:22:35.000+0000,1,1.29,100,43,8.5,1.0,0.5,1.92,0,0.3,14.72,2.5,0,2021-11-23,19:11:27,2021-11-23,19:22:35,668,0.19


In [0]:
# Determine Speed of each trip with divided trip_distance by duration_hours
df_y_s_2 = df_y_s_2.withColumn(
    "trip_speed",
    round(col("trip_distance") / col("duration_hours"), 2)
)

display(df_y_s_2.limit(10))

tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,d_pickup,t_pickup,d_dropoff,t_dropoff,duration_seconds,duration_hours,trip_speed
2021-11-23T19:11:52.000+0000,2021-11-23T19:23:06.000+0000,1,1.54,68,114,8.5,1.0,0.5,2.56,0,0.3,15.36,2.5,0,2021-11-23,19:11:52,2021-11-23,19:23:06,674,0.19,8.11
2021-11-23T19:41:58.000+0000,2021-11-23T19:48:47.000+0000,1,0.93,170,233,6.0,1.0,0.5,1.0,0,0.3,11.3,2.5,0,2021-11-23,19:41:58,2021-11-23,19:48:47,409,0.11,8.45
2021-11-23T19:45:35.000+0000,2021-11-23T19:58:12.000+0000,1,1.5,246,170,9.5,3.5,0.5,1.0,0,0.3,14.8,2.5,0,2021-11-23,19:45:35,2021-11-23,19:58:12,757,0.21,7.14
2021-11-23T19:33:08.000+0000,2021-11-23T19:41:16.000+0000,1,1.26,237,233,7.5,1.0,0.5,2.36,0,0.3,14.16,2.5,0,2021-11-23,19:33:08,2021-11-23,19:41:16,488,0.14,9.0
2021-11-23T19:31:15.000+0000,2021-11-23T19:36:15.000+0000,1,0.8,162,229,5.5,1.0,0.5,0.0,0,0.3,9.8,2.5,0,2021-11-23,19:31:15,2021-11-23,19:36:15,300,0.08,10.0
2021-11-23T19:24:13.000+0000,2021-11-23T19:29:17.000+0000,1,0.72,186,246,5.5,1.0,0.5,0.0,0,0.3,9.8,2.5,0,2021-11-23,19:24:13,2021-11-23,19:29:17,304,0.08,9.0
2021-11-23T19:42:24.000+0000,2021-11-23T20:11:21.000+0000,1,3.39,230,263,19.0,1.0,0.5,5.82,0,0.3,29.12,2.5,0,2021-11-23,19:42:24,2021-11-23,20:11:21,1737,0.48,7.06
2021-11-23T19:26:31.000+0000,2021-11-23T19:44:21.000+0000,2,2.4,170,142,12.5,3.5,0.5,1.68,0,0.3,18.48,2.5,0,2021-11-23,19:26:31,2021-11-23,19:44:21,1070,0.3,8.0
2021-11-23T19:16:06.000+0000,2021-11-23T19:53:08.000+0000,2,7.8,88,100,31.5,3.5,0.5,0.0,0,0.3,35.8,2.5,0,2021-11-23,19:16:06,2021-11-23,19:53:08,2222,0.62,12.58
2021-11-23T19:11:27.000+0000,2021-11-23T19:22:35.000+0000,1,1.29,100,43,8.5,1.0,0.5,1.92,0,0.3,14.72,2.5,0,2021-11-23,19:11:27,2021-11-23,19:22:35,668,0.19,6.79


In [0]:
#Find any trips with negative speed
filter_3 = df_y_s_2.where("trip_speed < 0")
display(filter_3)

In [0]:
# Count the filter_3
filter_3.count()

Out[33]: 2283

In [0]:
#Filter the trips with negative speed out and count the rows have left
df_y_s_2 = df_y_s_2.filter(col("trip_speed") >= 0)
df_y_s_2.count()

In [0]:
# 4) Trips with very high speed (look for NYC and outside of NYC speed limit )
# NYC has 25 mph speed limit unless otherwise posted and outside NYC is 70 mph.
# The trip is speed over 25 mph that have to be filtered out because most of trips are in NYC.
filter_4 = df_y_s_2.where((col("trip_speed") > 25))
display(filter_4.select("PULocationID","DOLocationID","trip_distance", "duration_hours", "trip_speed"))

In [0]:
filter_4.count()

Out[38]: 5784630

In [0]:
#Filter the trips with over speed limit and count the rows have left
df_y_s_2 = df_y_s_2.filter((col("trip_speed") < 25) & (col("trip_speed") >= 0))
df_y_s_2.count()

Out[7]: 125971959

In [0]:
#Save file #2
df_y_s_2.write.option("header", True).mode("overwrite").parquet("/FileStore/tables/df_yellow_cleaned.parquet")

In [0]:
#Load file #3
df_y_s_3 = spark.read.option("header", True).parquet("/FileStore/tables/df_yellow_cleaned.parquet")
display(df_y_s_3)

In [0]:
# 5) Trips that are travelling too short or too long (duration wise)
#Find the trips that are too short duration.
# Find trips that have duration shorter than 0.02 * 60 = 1.2 minutes
filter_5 = df_y_s_3.where("duration_hours < 0.02")
display(filter_5.select("duration_hours"))
filter_5.count()

In [0]:
#Filter 5) out
from pyspark.sql.functions import col
df_y_s_3 = df_y_s_3.filter(col("duration_hours") >= 0.02)
df_y_s_3.select("duration_hours").show()

+--------------+
|duration_hours|
+--------------+
|          0.17|
|          0.16|
|          0.17|
|          0.05|
|          0.09|
|          0.31|
|          0.15|
|          0.11|
|          0.35|
|          0.41|
|           0.1|
|           0.2|
|          0.14|
|          0.41|
|          0.27|
|          0.08|
|          0.14|
|          0.19|
|          0.56|
|          0.18|
+--------------+
only showing top 20 rows



In [0]:
#Find the trips that too long
# A driver shift is 12 hours is from the rule. 
filter_5_1 = df_y_s_3.where("duration_hours > 12.00")
display(filter_5_1.select("duration_hours"))
filter_5_1.count()

In [0]:
#Filter 5_1 out
df_y_s_3 = df_y_s_3.filter(col("duration_hours") <= 12.00)
df_y_s_3.select("duration_hours").show()
df_y_s_3.count()

+--------------+
|duration_hours|
+--------------+
|          0.17|
|          0.16|
|          0.17|
|          0.05|
|          0.09|
|          0.31|
|          0.15|
|          0.11|
|          0.35|
|          0.41|
|           0.1|
|           0.2|
|          0.14|
|          0.41|
|          0.27|
|          0.08|
|          0.14|
|          0.19|
|          0.56|
|          0.18|
+--------------+
only showing top 20 rows

Out[85]: 125391473

In [0]:
# 6) Trips that are travelling too short or too long (distance wise)
#Find the trips that are too short
#Filter the same Pickup and Dropoff location or distance shorter than 0.1 miles or 0.16 km which is the distance that regular people can walk.
filter_6 = df_y_s_3.where((col("trip_distance") < 0.1))
filter_6.select("PULocationID", "DOLocationID", "trip_distance").show()
filter_6.count()

+------------+------------+-------------+
|PULocationID|DOLocationID|trip_distance|
+------------+------------+-------------+
|         189|         189|          0.0|
|         193|         193|          0.0|
|         193|         193|          0.0|
|         145|         145|          0.0|
|         140|         140|          0.0|
|         141|         141|          0.0|
|          68|         158|          0.0|
|          68|         246|         0.01|
|         246|          48|          0.0|
|         170|         170|         0.03|
|         166|          24|         0.08|
|         238|         238|         0.04|
|         263|         263|          0.0|
|         113|         113|          0.0|
|         107|         107|          0.0|
|         148|         132|          0.0|
|         164|         164|         0.05|
|         138|         142|          0.0|
|         237|         237|          0.0|
|         233|         162|         0.09|
+------------+------------+-------

In [0]:
#Filter 6) out
df_y_s_3 = df_y_s_3.filter(col("trip_distance") >= 0.1)
df_y_s_3.select("trip_distance").show()
df_y_s_3.count()

+-------------+
|trip_distance|
+-------------+
|          1.0|
|          0.7|
|          1.5|
|         0.71|
|         1.48|
|         4.59|
|          0.8|
|         0.72|
|         1.33|
|         2.38|
|          1.0|
|          1.3|
|          0.4|
|          3.3|
|          2.6|
|         0.75|
|         1.12|
|         1.84|
|          7.6|
|         0.54|
+-------------+
only showing top 20 rows

Out[87]: 125012514

In [0]:
#Find the trips that are too long distance.
#Measured the distance from the northest to the southest of NYC is around 50 miles. So, taxi can't commute more than 50 miles
#Filter the trip that has the distance more than 50 miles
filter_6_1 = df_y_s_3.where((col("trip_distance") > 50))
filter_6_1.select("PULocationID", "DOLocationID", "trip_distance").show()
filter_6_1.count()

+------------+------------+-------------+
|PULocationID|DOLocationID|trip_distance|
+------------+------------+-------------+
|         230|         132|        58.24|
|         132|         100|         53.3|
|         132|         265|        51.75|
|         132|          28|        54.22|
|          48|         265|        50.78|
|          15|          15|        51.71|
|         100|         265|        50.39|
|         164|         107|        56.32|
|          44|         265|        50.38|
|         132|         265|        54.87|
|         132|         265|        55.43|
|         163|          48|         59.6|
|         186|         100|         90.2|
|          75|          69|         96.3|
|         181|         143|         53.6|
|         152|          17|         66.3|
|         162|         162|        60.65|
|          22|         129|         70.3|
|         162|         162|         62.4|
|         162|         162|        51.47|
+------------+------------+-------

In [0]:
#Filter 6_1) out
df_y_s_3 = df_y_s_3.filter(col("trip_distance") <= 50)
df_y_s_3.select("trip_distance").show()
df_y_s_3.count()


+-------------+
|trip_distance|
+-------------+
|          1.0|
|          0.7|
|          1.5|
|         0.71|
|         1.48|
|         4.59|
|          0.8|
|         0.72|
|         1.33|
|         2.38|
|          1.0|
|          1.3|
|          0.4|
|          3.3|
|          2.6|
|         0.75|
|         1.12|
|         1.84|
|          7.6|
|         0.54|
+-------------+
only showing top 20 rows

Out[89]: 125012017

In [0]:
#7) Any other logic you think is important
#Show the cleaned dataset of yellow taxi so far
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Use Spark SQL and create temp view for SQL
df_y_s_3.createOrReplaceTempView("taxi_y_trips")

In [0]:
#Filter the total amount charged to passengers less than 3 because the initial charge of 2015-2022 is $2.50 
filter_7_1 = spark.sql("SELECT total_amount FROM taxi_y_trips WHERE total_amount < 2.50")
filter_7_1.show()
filter_7_1.count()

+------------+
|total_amount|
+------------+
|        -4.8|
|        -7.3|
|        -5.3|
|        -4.8|
|        -3.8|
|        -7.3|
|        -4.8|
|       -42.8|
|        0.31|
|        -6.8|
|        -6.8|
|       -11.3|
|       -10.8|
|        -4.8|
|       -18.3|
|        -5.8|
|        -4.3|
|       -12.8|
|         0.0|
|       -27.3|
+------------+
only showing top 20 rows

Out[92]: 122639

In [0]:
#Filter 7_1) out
df_y_s_3 = df_y_s_3.filter("total_amount >= 2.50")
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Show the value of each MTA tax
filter_7_2 = spark.sql("SELECT mta_tax, COUNT(*) AS numbers FROM taxi_y_trips GROUP BY mta_tax ")
filter_7_2.show()

+-------+---------+
|mta_tax|  numbers|
+-------+---------+
|    0.0|   236289|
|   2.64|        1|
|    0.5|124663811|
|   0.85|      498|
|   0.55|        1|
|    3.0|        4|
|   -0.5|   109532|
|   1.03|       17|
|    3.3|       23|
|  18.49|        1|
|   0.35|       15|
|   2.74|        1|
|   2.55|        1|
|   48.4|        1|
|   59.3|        1|
|    0.8|     1472|
|   2.54|       98|
|   0.68|        1|
|    1.3|       60|
|    1.5|       18|
+-------+---------+
only showing top 20 rows



In [0]:
#Filter the MTA tax that is not 0.5 and 0 out because $0.5 MTA tax is applied to trips inside Manhattan, Other trip outside Manhattan MTA tax is not applied.
df_y_s_3 = df_y_s_3.filter((col("mta_tax") == 0) | (col("mta_tax") == 0.5))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Filter the extra that occur when trips are driving in rush hour or overnight. 
filter_7_3 = spark.sql("SELECT extra, COUNT(*) AS numbers FROM taxi_y_trips WHERE extra IN (0, 0.5, 1) GROUP BY extra")
filter_7_3.show()

+-----+--------+
|extra| numbers|
+-----+--------+
|  0.0|60700981|
|  1.0|18679901|
|  0.5|35191976|
+-----+--------+



In [0]:
#Filter 7_3) out
df_y_s_3 = df_y_s_3.filter((col("extra") == 0) | (col("extra") == 0.5) | (col("extra") == 1.0))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Find improvement surcharge how much of each trip.
filter_7_4 = spark.sql("SELECT improvement_surcharge, COUNT(*) AS numbers FROM taxi_y_trips GROUP BY improvement_surcharge")
filter_7_4.show()

+---------------------+---------+
|improvement_surcharge|  numbers|
+---------------------+---------+
|                  0.0|   140451|
|                 -0.3|   107378|
|                  0.3|124552558|
|                 -1.0|     1644|
|                  1.0|   209982|
|                  0.6|        2|
|                 0.03|        1|
|                 0.45|        1|
+---------------------+---------+



In [0]:
#Filter 7_4) only improvement surcharge = 0.3
df_y_s_3 = df_y_s_3.filter((col("Improvement_surcharge") == 0.3))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Filter the negative value of fare_amount, tip_amount, tolls_amount
filter_7_5 = spark.sql("SELECT fare_amount, count(*) AS numbers FROM taxi_y_trips where fare_amount < 0 GROUP BY fare_amount ")
filter_7_5.show()

+-----------+-------+
|fare_amount|numbers|
+-----------+-------+
|       -5.5|   7901|
|      -1.36|      1|
|      -14.5|    872|
|      -1.55|     19|
|      -58.0|      8|
|      -52.0|   2323|
|      -30.5|    132|
|     -10.64|      1|
|       -5.0|   9967|
|      -19.8|     24|
|      -18.0|    464|
|      -32.5|     79|
|      -29.0|    140|
|      -50.0|     58|
|      -34.5|    100|
|     -34.11|      1|
|     -26.78|      1|
|      -42.0|     66|
|      -5.05|     13|
|     -12.05|      6|
+-----------+-------+
only showing top 20 rows



In [0]:
filter_7_6 = spark.sql("SELECT tip_amount, count(*) AS numbers FROM taxi_y_trips where tip_amount < 0 GROUP BY tip_amount ")
filter_7_6.show()

+----------+-------+
|tip_amount|numbers|
+----------+-------+
|     -1.36|     12|
|     -5.53|      1|
|    -11.06|      5|
|      -1.0|     34|
|    -26.39|      1|
|     -24.5|      1|
|     -2.16|      2|
|     -1.76|      8|
|    -32.38|      1|
|     -1.46|     13|
|     -2.45|      3|
|     -0.76|      1|
|     -1.66|     15|
|    -36.85|      1|
|     -15.5|      1|
|     -2.26|      5|
|    -54.19|      1|
|    -12.31|      1|
|     -14.0|      3|
|      -3.2|      2|
+----------+-------+
only showing top 20 rows



In [0]:
filter_7_7 = spark.sql("SELECT tolls_amount, count(*) AS numbers FROM taxi_y_trips where tolls_amount < 0 GROUP BY tolls_amount ")
filter_7_7.show()

+------------+-------+
|tolls_amount|numbers|
+------------+-------+
|       -27.5|      2|
|      -13.75|     25|
|        -3.0|     12|
|        -2.8|      8|
|       -6.55|   1427|
|       -30.0|      1|
|      -11.75|     29|
|       -6.12|    467|
|       -2.45|      4|
|       -10.5|     18|
|      -10.17|      5|
|       -5.76|     72|
|        -5.0|      5|
|      -20.05|      1|
|       -22.0|      3|
|       -12.5|     11|
|      -16.11|      1|
|      -10.62|      1|
|      -27.75|      1|
|        -8.5|      1|
+------------+-------+
only showing top 20 rows



In [0]:
#Filter 7_5, 7_6 and 7_7 out
df_y_s_3 = df_y_s_3.filter((col("fare_amount") >= 0) & (col("tolls_amount") >= 0) & (col("tip_amount") >= 0))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Find the impossible passenger count
filter_7_8 = spark.sql("SELECT passenger_count, count(*) AS numbers FROM taxi_y_trips GROUP BY passenger_count ")
filter_7_8.show()

+---------------+--------+
|passenger_count| numbers|
+---------------+--------+
|            8.0|      53|
|            0.0|  860375|
|            7.0|     109|
|           null|  690632|
|            1.0|88461639|
|            4.0| 2465036|
|            3.0| 5186896|
|            2.0|18210247|
|            6.0| 3521381|
|            5.0| 5615569|
|            9.0|      80|
+---------------+--------+



In [0]:
#Filter out the passenger count more than 5 passengers and 0 passenger because The maximum amount of passengers are allowed in a taxicab by law is four in a four passenger taxicab or five passengers in a five passenger taxicab.
df_y_s_3 = df_y_s_3.filter((col("passenger_count") <= 5) & (col("passenger_count") > 0))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Show the value of congestion 
filter_7_9 = spark.sql("SELECT congestion_surcharge, count(*) AS numbers FROM taxi_y_trips GROUP BY congestion_surcharge ")
filter_7_9.show()

+--------------------+--------+
|congestion_surcharge| numbers|
+--------------------+--------+
|                 0.0| 2407020|
|                null|93023983|
|                 2.5|29501340|
|                 0.5|       7|
|                0.75|     264|
|                2.75|     142|
|                -2.5|   79253|
|                 1.5|       1|
|                -1.5|       1|
|               -0.75|       2|
|                 1.0|       2|
|                 2.0|       2|
+--------------------+--------+



In [0]:
#Filter only 0, 2.5 and null values
df_y_s_3 = df_y_s_3.filter((col("congestion_surcharge").isNull()) | (col("congestion_surcharge") == 0) | (col("congestion_surcharge") == 2.5))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
filter_7_10 = spark.sql("SELECT airport_fee, count(*) AS numbers FROM taxi_y_trips GROUP BY airport_fee ")
filter_7_10.show()

+-----------+---------+
|airport_fee|  numbers|
+-----------+---------+
|        0.0| 11289757|
|       1.25|   503468|
|       null|113216115|
|      -1.25|     2677|
+-----------+---------+



In [0]:
#Filter only 0, 1.25 and null values
df_y_s_3 = df_y_s_3.filter((col("airport_fee").isNull()) | (col("airport_fee") == 0) | (col("airport_fee") == 1.25))
display(df_y_s_3)
df_y_s_3.count()

In [0]:
#Drop some unused columns
df_y_s_3 = df_y_s_3.drop("tpep_pickup_datetime", "tpep_dropoff_datetime", "duration_seconds")
#The final of yellow taxi dataframe
display(df_y_s_3)

In [0]:
df_y_s_3.write.mode("overwrite").parquet("/FileStore/tables/df_yellow_cleaned.parquet")

In [0]:
df_y_s_cleaned = spark.read.option("header", True).parquet("/FileStore/tables/df_yellow_cleaned.parquet")
display(df_y_s_cleaned)
df_y_s_cleaned.count()