**The goal of this Jupyter Notebook is to:**

- Identify data quality issues, like missing values, duplicate data, etc.

- Formalize the steps to clean the datasets.

In [None]:
import os.path as osp

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    BooleanType, DateType, DoubleType, IntegerType, LongType, StringType, StructField, StructType, TimestampType
)
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf

# !pip3 install matplotlib pandas

import matplotlib.pyplot as plt

In [None]:
# Create the Spark session used by the rest of the notebook, or reuse an
# already-running one (getOrCreate).
spark = (
    SparkSession.builder
    .appName("Sparkify ETL")
    .getOrCreate()
)
# Bare expression as the last statement of the cell so the notebook renders
# the session object.
spark

## Capital bikeshare trip data

![Capital bikeshare system map](./capital_bikeshare_system_map.png)

In [None]:
# Glob pattern selecting every trip-data CSV whose file name starts with "2020".
TRIP_DATA_PATH = "./datasets/capitalbikeshare_tripdata/2020*.csv"

# Explicit schema for the trip CSV files, so column types are declared here
# rather than inferred by Spark. Every field uses StructField's default
# nullability.
trip_data_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ('ride_id', StringType()),
        ('rideable_type', StringType()),
        ('started_at', TimestampType()),
        ('ended_at', TimestampType()),
        ('start_station_name', StringType()),
        ('start_station_id', LongType()),
        ('end_station_name', StringType()),
        ('end_station_id', LongType()),
        ('start_lat', DoubleType()),
        ('start_lng', DoubleType()),
        ('end_lat', DoubleType()),
        ('end_lng', DoubleType()),
        ('member_casual', StringType()),
    )
])

# header=True: the first line of each file is a header row, not data.
trip_data = spark.read.csv(
    path=TRIP_DATA_PATH,
    schema=trip_data_schema,
    header=True,
)

# Quick inspection: sample rows, resulting schema and overall row count.
trip_data.show(n=5)

trip_data.printSchema()

print("Total number records: ", trip_data.count())

## COVID data by states

Data description can be found at https://covidtracking.com/data/api.

In [None]:
# Location of the daily COVID data file (JSON).
COVID_DATA_PATH = "./datasets/covid_data/daily.json"

covid_data = (
    spark.read.json(COVID_DATA_PATH)
    # Keep only the columns of interest.
    .select(
        "dataQualityGrade",
        "date",
        "state",
        "death",
        "deathIncrease",
        "hospitalizedCurrently",
        "hospitalizedDischarged",
        "hospitalizedIncrease",
        "positive",
        "positiveIncrease",
        "recovered",
    )
    # Keep only the Washington DC rows; "state" is then constant, so drop it.
    .where(col("state") == "DC")
    .drop("state")
)

# Quick inspection: sample rows, inferred schema and overall row count.
covid_data.show(n=5)

covid_data.printSchema()

print("Total number records: ", covid_data.count())

Drop columns that contain only a single value (e.g. null), which typically means the data is not available.

In [None]:
# Show the distinct value combinations of the candidate columns, to check
# whether they carry any information before dropping them.
covid_data.select(
    "dataQualityGrade",
    "hospitalizedDischarged",
    "hospitalizedIncrease",
).distinct().show()

# Remove those columns from the working DataFrame.
covid_data = covid_data.drop(
    "dataQualityGrade",
    "hospitalizedDischarged",
    "hospitalizedIncrease",
)

In [None]:
# Preview the first five rows of the COVID data at this point in the notebook.
covid_data.show(n=5)

In [None]:
# Collect the Spark DataFrame into a pandas DataFrame for plotting.
covid_df = covid_data.toPandas()

# Line plot of the "deathIncrease" column against the pandas row index
# (no x column is specified).
covid_df["deathIncrease"].plot(kind="line")

## Weather data

- AWND: Average daily wind speed (miles per hour)
- TAVG: Average temperature (Fahrenheit)
- TMAX: Maximum temperature (Fahrenheit)
- TMIN: Minimum temperature (Fahrenheit)
- TOBS: Temperature at the time of observation (Fahrenheit)
- WDF2: Direction of fastest 2-minute wind (degrees)
- WDF5: Direction of fastest 5-second wind (degrees)
- WSF2: Fastest 2-minute wind speed (miles per hour)
- WSF5: Fastest 5-second wind speed (miles per hour)
- WDMV: 24-hour wind movement (miles)
- WT01: Fog, ice fog, or freezing fog (may include heavy fog)
- WT02: Heavy fog or heavy freezing fog (not always distinguished from fog)
- WT03: Thunder
- WT04: Ice pellets, sleet, snow pellets, or small hail
- WT05: Hail (may include small hail)
- WT06: Glaze or rime
- WT08: Smoke or haze
- WT11: High or damaging winds

In [None]:
# Glob pattern selecting every weather CSV whose file name ends in "_daily.csv".
WEATHER_DATA_PATH = "./datasets/weather_data/*_daily.csv"

# Explicit schema for the weather CSV files. The WT?? weather-type flags are
# read as strings so that missing flags can be filled with '0' below.
weather_data_schema = StructType([
    StructField('STATION', StringType()),
    StructField('NAME', StringType()),
    StructField('DATE', DateType()),
    StructField('AWND', DoubleType()),
    StructField('TAVG', DoubleType()),
    StructField('TMAX', DoubleType()),
    StructField('TMIN', DoubleType()),
    StructField('TOBS', DoubleType()),
    StructField('WDF2', DoubleType()),
    StructField('WDF5', DoubleType()),
    StructField('WDMV', DoubleType()),
    StructField('WSF2', DoubleType()),
    StructField('WSF5', DoubleType()),
    StructField('WT01', StringType()),
    StructField('WT02', StringType()),
    StructField('WT03', StringType()),
    StructField('WT04', StringType()),
    StructField('WT05', StringType()),
    StructField('WT06', StringType()),
    StructField('WT08', StringType()),
    StructField('WT11', StringType()),
])

# Load the files and discard the columns that are not used afterwards.
weather_data = spark.read.csv(WEATHER_DATA_PATH, header=True, schema=weather_data_schema).drop(
    "NAME", "TOBS", "WDF2", "WDF5", "WDMV", "WSF2", "WSF5")

print("Number of stations before filtering: ", weather_data.select('STATION').distinct().count())

# Remove incomplete data: keep only rows where all four core measurements
# are present.
weather_data = weather_data.filter(
    col("AWND").isNotNull()
    & col("TAVG").isNotNull()
    & col("TMAX").isNotNull()
    & col("TMIN").isNotNull()
)

print("Number of stations after filtering: ", weather_data.select('STATION').distinct().count())

# Fill the WT?? null values with '0'. The fill is restricted to the WT??
# columns: without `subset`, na.fill('0') would also overwrite nulls in any
# other string column (e.g. STATION).
weather_data = weather_data.na.fill(
    '0',
    subset=["WT01", "WT02", "WT03", "WT04", "WT05", "WT06", "WT08", "WT11"],
)

# Quick inspection: sample rows, resulting schema and overall row count.
weather_data.show(5)

weather_data.printSchema()

print("Total number records: ", weather_data.count())

In [None]:
# Cast each WT?? weather-type flag column to BooleanType.
# withColumn() with an existing column name replaces that column in place,
# so no temporary "<name>_orig" rename/drop round trip is needed.
for code in ("01", "02", "03", "04", "05", "06", "08", "11"):
    flag_name = f"WT{code}"
    weather_data = weather_data.withColumn(flag_name, col(flag_name).cast(BooleanType()))

To keep things simple, I select only one of the three stations. Of course, one could instead use the aggregated values of the three stations, or map weather stations to bike stations.

In [None]:
# Keep only the observations of station USW00093721; the STATION column is
# then constant, so it is dropped.
weather_data = (
    weather_data
    .where(col("STATION") == "USW00093721")
    .drop("STATION")
)

print("Total number records: ", weather_data.count())

# Sanity check: the number of distinct dates equals the number of rows,
# i.e. there is exactly one row per date.
assert weather_data.select("DATE").distinct().count() == weather_data.count()

In [None]:
# Preview the first five rows of the weather data at this point in the notebook.
weather_data.show(n=5)

In [None]:
# Collect the Spark DataFrame into a pandas DataFrame for plotting.
weather_df = weather_data.toPandas()

# One row with two side-by-side panels: temperatures (left), wind speed (right).
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
temperature_ax, wind_ax = axes

weather_df.plot(x="DATE", y=["TAVG", "TMIN", "TMAX"], ax=temperature_ax)
weather_df.plot(x="DATE", y="AWND", ax=wind_ax)

# Rotate the x tick labels by 45 degrees on both panels.
for ax in axes:
    ax.tick_params(axis="x", labelrotation=45)