In [1]:
# Read the Parquet file at the storage path below into a Spark DataFrame,
# then preview its first ten rows.
tripSourcePath = 'abfss://data@dpsyndlsdemodev.dfs.core.windows.net/sourceData/nycTripYellow2019Jan/nycTripYellow2019Jan.snappy.parquet'
df = spark.read.format('parquet').load(tripSourcePath)
previewRows = df.limit(10)
display(previewRows)

In [2]:
# List each column together with its data type.
columnTypes = df.dtypes
display(columnTypes)

In [3]:
# Get a baseline feel for the data from its summary statistics.
summaryStats = df.describe()

display(summaryStats)

In [2]:
# The fareAmount column looks suspicious: it contains values <= 0 as well as
# values > 1,000.

from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Trim both extremes: cut 0.15% off the bottom and 0.15% off the top, which
# leaves 99.7% of the rows between the two approximate percentiles.
fareTailCutoffs = [0.0015, 0.9985]
fareAmountBounds = percentile_approx(df["fareAmount"], fareTailCutoffs)

dfFareAmountPercentile = df.select(fareAmountBounds.alias("percentRank"))

display(dfFareAmountPercentile)

In [3]:
# Plot tripDistance against fareAmount: we expect a relationship between the
# distance travelled and the amount charged.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas.plotting import scatter_matrix
# Imported explicitly so this cell does not depend on an earlier cell's
# wildcard import having been executed first.
from pyspark.sql.functions import col

# Discard any figures left over from earlier runs. plt.clf() would only clear
# the *current* figure (creating an empty one if none exists), while
# scatter_matrix draws on a brand-new figure, so a blank/stale figure could be
# shown alongside the plot.
plt.close("all")

# Keep only fares inside the inclusive window [2.5, 75].
# Additional columns can be added to the select for further analysis.
dfDqCheck = df.filter(col("fareAmount").between(2.5, 75))
# NOTE: toPandas() collects every remaining row onto the driver.
pdDqCheck = dfDqCheck.select("tripDistance", "fareAmount").toPandas()
scatter_matrix(pdDqCheck, figsize=(12, 12))

plt.show()

# As the plot shows, tripDistance has outlier values as well.
# The same percentile-based approach can be applied to it.

# This does not mean that trips with a fare below $2.50 or above $75 are
# invalid records; those records should be investigated further to determine
# whether they are valid.

In [6]:
# Save the filtered trips as a table, replacing any existing table of the
# same name. The database is created first if it does not exist yet.

from pyspark.sql.functions import *
spark.sql("CREATE DATABASE IF NOT EXISTS dataExplorationSparkDb")
cleanTripsWriter = dfDqCheck.write.mode("overwrite")
cleanTripsWriter.saveAsTable("dataExplorationSparkDb.nycTripYl2019JanClean")

In [5]:
# Clean up: uncomment the line below to drop the database and everything in it.
# spark.sql("DROP DATABASE IF EXISTS dataExplorationSparkDb CASCADE")