In [24]:
# Clean up: uncomment the statement below to drop the demo database and all of its tables before a fresh run.
# spark.sql("DROP DATABASE IF EXISTS demoDataSparkDb CASCADE")

In [1]:
# Read the source parquet file from storage.
dataSource = 'abfss://demo@stdemodatalake001.dfs.core.windows.net/'
path = 'nycTripYellow2019Jan/nycTripYellow2019Jan.snappy.parquet'
fileFormat = 'parquet'

# Set the reader format first, then load from the full storage URI.
df = spark.read.format(fileFormat).load(dataSource + path)

# Preview only the first 10 rows rather than the whole dataset.
display(df.limit(10))

In [2]:
# List every column together with the data type Spark read for it.
columnTypes = df.dtypes
display(columnTypes)

# Columns whose data type needs fixing:
#   vendorID, puLocationId, doLocationId, paymentType, improvementSurcharge

In [3]:
# Get a baseline feel for the data (max, min, etc.) from the summary statistics.

dfSummary = df.describe()
display(dfSummary)

# Same issues as seen in the SQL analysis:
#  > PassengerCount ranges from 0 to 9
#  > FareAmount ranges from negative values up to 623k

In [4]:
# Count the trips whose pickup time is at or after the drop-off time.
from pyspark.sql.functions import *

pickupNotBeforeDropoff = col('tpepPickupDateTime') >= col('tpepDropoffDateTime')

# Despite the "df" prefix, this holds the row count returned by count(), not a DataFrame.
dfTimeCheck = df.where(pickupNotBeforeDropoff).count()
display(dfTimeCheck)

In [5]:
# fareAmount deserves a closer look: find cut-offs that exclude the extreme fares.
# The 0.15% and 99.85% approximate percentiles bound the central 99.7% of the
# positive fares (the same coverage as +/- 3 sigma under a Normal distribution).

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Only strictly positive fares take part in the percentile calculation.
positiveFares = df.filter(col("fareAmount") > 0)

# A single array column holding [lower bound, upper bound].
fareBounds = percentile_approx(col("fareAmount"), [0.0015, 0.9985]).alias("percentRank")
dfFareAmountPercentile = positiveFares.select(fareBounds)

display(dfFareAmountPercentile)

In [6]:
# Keep only the rows that pass every data quality check identified so far.

hasPassengers = col("passengerCount") > 0
hasDistance = col("tripDistance") > 0
dropoffAfterPickup = col('tpepDropoffDateTime') > col('tpepPickupDateTime')
fareInRange = (col("fareAmount") >= 2.5) & (col("fareAmount") <= 75)

# A row must satisfy all four conditions to be kept.
dfClean = df.filter(hasPassengers & hasDistance & dropoffAfterPickup & fareInRange)

display(dfClean.count())

# Recorded result: 7664528 rows -> 7477805 rows,
# i.e. roughly 186k records have issues.

In [21]:
# Create the clean table: drop the unused coordinate columns, fix the column
# data types, and persist the result as a managed table.

from pyspark.sql.functions import *
from pyspark.sql.types import *

spark.sql("CREATE DATABASE IF NOT EXISTS demoDataSparkDb")

# Spark DataFrames are immutable: drop() and withColumn() return a NEW DataFrame.
# The result has to be assigned back, otherwise the casts are silently discarded
# and the table is written with the original (wrong) column types.
dfClean = (
    dfClean
    .drop("startLon", "startLat", "endLon", "endLat")
    .withColumn("vendorId", col("vendorId").cast(IntegerType()))
    .withColumn("puLocationId", col("puLocationId").cast(IntegerType()))
    .withColumn("doLocationId", col("doLocationId").cast(IntegerType()))
    .withColumn("paymentType", col("paymentType").cast(IntegerType()))
    .withColumn("improvementSurcharge", col("improvementSurcharge").cast(DoubleType()))
)

# NOTE(review): if this table already exists with the old column types, confirm that
# mode("overwrite") also replaces its schema for the table format used in this workspace.
dfClean.write.mode("overwrite").saveAsTable("demoDataSparkDb.clean_nycTripYellow2019Jan")

In [22]:
# Read the saved table back and preview the first 10 rows.
cleanTableName = "demoDataSparkDb.clean_nycTripYellow2019Jan"
dfTest = spark.read.table(cleanTableName)

dfPreview = dfTest.limit(10)
display(dfPreview)

In [23]:
# Plot the cleaned data: we expect a relationship between trip distance and fare amount.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix

# Start from an empty figure.
plt.clf()

# More columns can be added to this list for additional analysis.
# Question: does fareAmount have any relationship with tripDistance?
plotColumns = ["tripDistance", "fareAmount"]
pdComplexCheck = dfTest.select(*plotColumns).toPandas()
scatter_matrix(pdComplexCheck, figsize=(12, 12))

plt.show()

# As the plot shows, trip distance has some outlier values as well.
# The same percentile-based approach can be applied to it.

# This does not mean that trips costing < $2.5 or > $75 are invalid records.
# We should discuss them with the client to ensure the data is accurate.

# Observation (Right)
# - The majority of fares are < $15
# - There are some outliers where the fare is $10 but the trip covers 700 miles