In [1]:
from pyspark.sql.functions import isnan, when, count, col
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, DoubleType
import datetime
from pyspark.sql.functions import expr

In [2]:
from pyspark.sql import SparkSession, functions as F

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/22 00:57:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
sdf = spark.read.parquet('../data/weather/weather_data.parquet')
df = pd.read_parquet('../data/weather/weather_data.parquet')
sdf.printSchema()
sdf.show(1, vertical=True, truncate=100)
# show last row
print(sdf.tail(1))
# check if there are any null values
pd.isnull(df).sum()
# count total records
print(df.shape)


# after looking this there is no need for any preprocessing in weather data
# in this case landing and raw data are same

                                                                                

root
 |-- STATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- AWND: double (nullable = true)
 |-- PGTM: double (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- SNOW: double (nullable = true)
 |-- TAVG: double (nullable = true)
 |-- TMAX: double (nullable = true)
 |-- TMIN: double (nullable = true)



                                                                                

-RECORD 0-----------------------------------
 STATION | USW00094789                      
 NAME    | JFK INTERNATIONAL AIRPORT, NY US 
 DATE    | 2022-01-01                       
 AWND    | 2.8                              
 PGTM    | null                             
 PRCP    | 31.0                             
 SNOW    | 0.0                              
 TAVG    | 10.1                             
 TMAX    | 11.7                             
 TMIN    | 8.9                              
only showing top 1 row

[Row(STATION='USW00094789', NAME='JFK INTERNATIONAL AIRPORT, NY US', DATE='2023-05-31', AWND=2.3, PGTM=None, PRCP=0.0, SNOW=0.0, TAVG=14.7, TMAX=21.1, TMIN=8.9)]
(516, 10)


## Lets begin some preprocessing on yellow taxi data

In [4]:
sdf = spark.read.parquet('../data/tlc/')
sdf_jan = spark.read.parquet('../data/tlc/2022-01.parquet')
sdf_jan.printSchema()
sdf_feb_23 = spark.read.parquet('../data/tlc/2023-02.parquet')
sdf_feb_23.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetim

In [5]:
# first before moving into raw layer data from 2022 have passenger count as double 
# lets cast all data to double and then save it to raw layer
def cast_to_schema(year, start, end, temp_schema):
    for month in range(start, end+1):
        input_path = f'../data/tlc/{year}-{str(month).zfill(2)}.parquet'
        output_path = f'../data/raw/{year}-{str(month).zfill(2)}.parquet'
        sdf_malformed = spark.read.parquet(input_path)
        sdf_malformed = sdf_malformed \
            .select([F.col(c).cast(temp_schema[i].dataType) for i, c in enumerate(sdf_malformed.columns)])
        sdf_malformed \
        .coalesce(1) \
        .write \
        .mode('overwrite') \
        .parquet(output_path)

In [6]:
# see if there is any non-whole number passenger_count. So it can be removed
# before any proper data conversion can be made. Lets cast whole data to 
# jan 2022 schema which contains passenger_count as double
tem_schema = sdf_jan.schema
tem_schema
cast_to_schema("2022", 1, 12, tem_schema)
cast_to_schema("2023", 1, 5, tem_schema)

                                                                                

In [7]:
# see if there is any non-whole number passenger_count
sdf = spark.read.parquet('../data/raw/*')
non_whole_count = sdf.filter(col("passenger_count") % 1 != 0).count()
print(f"Number of rows with non-whole number passenger counts: {non_whole_count}")



Number of rows with non-whole number passenger counts: 0


                                                                                

In [8]:
# well we are now sure there wasn't any non-whole number passenger_count
# so lets try to cast all datatype to 2023 february schema (proper schema)
sdf_feb_23.printSchema()

# also ensuring we have consistent casing
consistent_col_casing = [F.col(col_name).alias(col_name.lower()) for col_name in sdf_feb_23.columns]
sdf_feb_23 = sdf_feb_23.select(*consistent_col_casing)
sdf_schema = sdf_feb_23.schema
sdf_schema

cast_to_schema(2022, 1, 12, sdf_schema)
cast_to_schema(2023, 1, 5, sdf_schema)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



                                                                                

In [9]:
sdf = spark.read.parquet('../data/raw/*')
sdf.printSchema()
sdf.show(1, vertical=True, truncate=100)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2022-10-

## Done with raw data &#128512;

In [10]:
# according to data dictionary store_and_fwd_flag represents boolean condition
# but currently have N and Y to represent No and Yes respectively
sdf = sdf.withColumn('store_and_fwd_flag', 
    (F.col('store_and_fwd_flag') == 'Y').cast('boolean'))
sdf.printSchema()
sdf.show(1, vertical=True, truncate = 100)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2022-10

In [11]:
# lets see the datashape before doing any further preprocessing
tot_rows = sdf.count()
tot_cols = len(sdf.columns)
print("Total rows: ", tot_rows, "Total columns: ", tot_cols)


Total rows:  55842484 Total columns:  19


In [12]:
# lets see which columns have missing values
missing_values = sdf.agg(*[F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in sdf.columns])
missing_values.show()



+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       0|                   0|                    0|        1796968|            0|   1796968|           1796968|           0|           0|           0|          0|    0|      0|         

                                                                                

In [13]:
# well there is immense amount of missing passenger data. (imputation doesn't makes sense here)
# so let drop those (1796968) rows
missing_passenger = (missing_values.collect()[0]['passenger_count'] 
                    / tot_rows * 100)
print(f'percentage of missing passenger_count data: {missing_passenger}%')
sdf_clean = sdf.filter(col("passenger_count").isNotNull())
missing_values2 = sdf_clean.agg(*[F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in sdf_clean.columns])
missing_values2.show()

                                                                                

percentage of missing passenger_count data: 3.217922755728416%




+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       0|                   0|                    0|              0|            0|         0|                 0|           0|           0|           0|          0|    0|      0|         

                                                                                

`after removing misssing passenger_count no other values seems to be missing` \
`How nice is this dataset? No missing values at all.` \
## lets do some outlier detection


In [14]:
# 1. Add a new temporary column to record trip distance so can remove those
# trips whcih span more than 5 hours
sdf_with_difference = sdf_clean.withColumn("time_difference", 
                                          col("tpep_dropoff_datetime")
                                          - col("tpep_pickup_datetime"))

# Count rows where the time_difference is more than 5 hours and remove them
count_greater_than_5_hours = sdf_with_difference \
                             .filter(expr("time_difference "
                             "> interval 5 hours")).count()

print(f"Number of rows with time_difference greater than 5 hours: "
      f"{count_greater_than_5_hours}, percentage: " 
      f"{count_greater_than_5_hours/sdf_with_difference.count() * 100}")
                                         
sdf_clean2 = sdf_with_difference \
            .filter(expr("time_difference <= interval 5 hours"))
sdf_clean2 = sdf_clean2.drop("time_difference")

# also remove those rows that includes drop off time before pick up time a
invalid_dropoff = sdf_clean2.filter(expr("tpep_dropoff_datetime "
                                    "<= tpep_pickup_datetime")).count()

print(f"Number of rows with invalid drop off time: {invalid_dropoff}," 
      f"percentage:  {invalid_dropoff/sdf_clean2.count() * 100}")

sdf_clean2 = sdf_clean2.filter(expr("tpep_dropoff_datetime "
                                    "> tpep_pickup_datetime"))
print(f'remaining rows {sdf_clean2.count()}')


                                                                                

Number of rows with time_difference greater than 5 hours: 62764, percentage: 0.1161317434734086


                                                                                

Number of rows with invalid drop off time: 23308,percentage:  0.043176753937998566




remaining rows 53959444


                                                                                

In [15]:
# reomove Trips with a pick-up/drop-off location ID out of the range 1-263
sdf_clean3 = sdf_clean2.filter(expr("PULocationID >= 1 "
                                    "AND PULocationID <= 263 "
                                    "AND DOLocationID >= 1 "
                                    "AND DOLocationID <= 263"))

invalid_id = sdf_clean2.count() - sdf_clean3.count()
print(f"Number of invalid location IDs: {invalid_id}",
      f"percentage: {invalid_id/sdf_clean2.count()*100}%")
print(sdf_clean3.count())

                                                                                

Number of invalid location IDs: 943953 percentage: 1.749374956495104%




53015491


                                                                                