**The goal of this Jupyter Notebook is to:**

- Identify data quality issues, like missing values, duplicate data, etc.

- Formalize the steps to clean the datasets.

In [None]:
import pathlib
from datetime import datetime

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    BooleanType, DateType, DoubleType, IntegerType, LongType, StringType, StructField, StructType, TimestampType
)
from pyspark.sql import functions as F

# !pip3 install matplotlib pandas

import matplotlib.pyplot as plt

In [None]:
# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName("Sparkify ETL")
spark = session_builder.getOrCreate()
# Bare expression so the notebook renders the session summary.
spark

## Capital bikeshare trip data

![Capital bikeshare system map](./capital_bikeshare_system_map.png)

Data description can be found at https://www.capitalbikeshare.com/system-data.

**After initial exploration, it was found that the data schema has changed since 05.2020. The new data schema is different from the schema listed on the official website. In the following, the data with different schemas will be merged.**

In [None]:
TRIP_DATA_FOLDER = pathlib.Path("./datasets/capitalbikeshare_tripdata")
TRIP_DATA_PATHS_OLD, TRIP_DATA_PATHS_NEW = [], []
for csv_path in TRIP_DATA_FOLDER.glob("*.csv"):
    # The first six characters of the file name are read as an integer
    # (presumably YYYYMM); files up to 202003 go to the old-schema list,
    # everything later to the new-schema list.
    period = int(csv_path.stem[:6])
    bucket = TRIP_DATA_PATHS_OLD if period <= 202003 else TRIP_DATA_PATHS_NEW
    bucket.append(str(csv_path))

# Bare expression so the notebook displays how many files landed in each list.
len(TRIP_DATA_PATHS_OLD), len(TRIP_DATA_PATHS_NEW)

In [None]:
# Read all new-schema trip files into one DataFrame, using each file's first
# row as the header. No schema is supplied here.
csv_reader = spark.read
trip_data_new = csv_reader.csv(path=TRIP_DATA_PATHS_NEW, header=True)

# Preview a few rows and the resulting schema.
trip_data_new.show(n=5)
trip_data_new.printSchema()

new_record_count = trip_data_new.count()
print("Total number new records: ", new_record_count)

In [None]:
# List the distinct values found in the "member_casual" column.
member_casual_values = trip_data_new.select("member_casual").distinct()
member_casual_values.show()

In [None]:
# List the distinct values found in the "rideable_type" column.
rideable_type_values = trip_data_new.select("rideable_type").distinct()
rideable_type_values.show()

In [None]:
# Read all old-schema trip files into one DataFrame, using each file's first
# row as the header. No schema is supplied here.
csv_reader = spark.read
trip_data_old = csv_reader.csv(path=TRIP_DATA_PATHS_OLD, header=True)

# Preview a few rows and the resulting schema.
trip_data_old.show(n=5)
trip_data_old.printSchema()

old_record_count = trip_data_old.count()
print("Total number of old records: ", old_record_count)

In [None]:
# List the distinct values found in the old schema's "Member type" column.
member_type_values = trip_data_old.select("Member type").distinct()
member_type_values.show()

## COVID data by states

Data description can be found at https://covidtracking.com/data/api.

In [None]:
COVID_DATA_PATH = "./datasets/covid_data/daily.json"

# Select only interested columns
covid_columns = [
    "dataQualityGrade", "date", "state",
    "death", "deathIncrease",
    "hospitalizedCurrently", "hospitalizedDischarged", "hospitalizedIncrease",
    "positive", "positiveIncrease", "recovered",
]
covid_data = spark.read.json(COVID_DATA_PATH).select(*covid_columns)

# Select only data from Washington DC; the "state" column is constant
# afterwards, so it is dropped.
is_dc = F.col("state") == "DC"
covid_data = covid_data.where(is_dc).drop("state")

# Preview a few rows and the resulting schema.
covid_data.show(n=5)
covid_data.printSchema()

covid_record_count = covid_data.count()
print("Total number records: ", covid_record_count)

**Drop columns which have a single value (e.g. null), which typically means the data is not available.**

In [None]:
single_valued_columns = ["dataQualityGrade", "hospitalizedDischarged", "hospitalizedIncrease"]

# Show the distinct value combinations of these columns before removing them.
covid_data.select(*single_valued_columns).distinct().show()
covid_data = covid_data.drop(*single_valued_columns)

covid_data.show(n=5)

**Convert type of column "date" from `long` to `date`.**

In [None]:
def func(column):
    """Convert a yyyyMMdd integer column into a Spark date column.

    Uses Spark's built-in ``to_date`` on the string form of the value instead
    of a Python UDF around ``datetime.strptime``. This avoids shipping every
    row through a Python worker, and a null input yields a null date rather
    than raising while parsing the string "None".

    Args:
        column: Spark column holding dates encoded as yyyyMMdd numbers.

    Returns:
        A Spark column expression of date type.
    """
    return F.to_date(column.cast("string"), "yyyyMMdd")


covid_data = covid_data.withColumn("date", func(F.col("date")))
# Show the earliest dates to confirm the conversion.
covid_data.orderBy("date").show(5)

**Fill null with 0. Actually, the null values were discovered by the following visualization. It is reasonable to do so since null values only appear at the beginning of the outbreak.**

In [None]:
# Replace nulls with 0, then sort the rows chronologically.
covid_data = covid_data.na.fill(0)
covid_data = covid_data.sort("date")

**Drop possible duplicated rows.**

In [None]:
# Report the row count before and after keeping one row per date.
rows_before = covid_data.count()
print("Before dropDuplicates: ", rows_before)

covid_data = covid_data.drop_duplicates(subset=["date"])

rows_after = covid_data.count()
print("After dropDuplicates: ", rows_after)

**Sanity check by plotting the COVID data.**

In [None]:
# Sort explicitly before collecting: Spark does not guarantee row order after
# a shuffle such as dropDuplicates, and a line plot drawn from unsorted rows
# connects points out of chronological order.
covid_df = covid_data.orderBy("date").toPandas()

# Plot every column except the x-axis column, selected by name so the result
# does not depend on "date" being the first column.
metric_columns = [name for name in covid_df.columns if name != "date"]

_, axes = plt.subplots(3, 2, figsize=(16, 9))

# One subplot per metric; zip stops at whichever of the two runs out first.
for col, ax in zip(metric_columns, axes.flatten()):
    covid_df.plot("date", col, ax=ax)
    ax.tick_params(axis='x', labelrotation=45)

plt.tight_layout()

**Note: the dip in the "hospitalizedCurrently" plot and the jump in the "recovered" plot are both suspicious!**

## Weather data

- AWND: Average daily wind speed (miles per hour)
- TAVG: Average temperature (Fahrenheit)
- TMAX: Maximum temperature (Fahrenheit)
- TMIN: Minimum temperature (Fahrenheit)
- TOBS: Temperature at the time of observation (Fahrenheit)
- WDF2: Direction of fastest 2-minute wind (degrees)
- WDF5: Direction of fastest 5-second wind (degrees)
- WSF2: Fastest 2-minute wind speed (miles per hour)
- WSF5: Fastest 5-second wind speed (miles per hour)
- WDMV: 24-hour wind movement (miles)
- WT01: Fog, ice fog, or freezing fog (may include heavy fog)
- WT02: Heavy fog or heavy freezing fog (not always distinguished from fog)
- WT03: Thunder
- WT04: Ice pellets, sleet, snow pellets, or small hail
- WT05: Hail (may include small hail)
- WT06: Glaze or rime
- WT08: Smoke or haze
- WT11: High or damaging winds

In [None]:
WEATHER_DATA_PATH = "./datasets/weather_data/*_daily.csv"

# Column groups in file order: identifying columns, numeric measurements
# (doubles), then weather-type flags (kept as strings at load time).
measurement_names = ["AWND", "TAVG", "TMAX", "TMIN", "TOBS", "WDF2", "WDF5", "WDMV", "WSF2", "WSF5"]
weather_type_names = ["WT01", "WT02", "WT03", "WT04", "WT05", "WT06", "WT08", "WT11"]

weather_data_schema = StructType(
    [
        StructField('STATION', StringType()),
        StructField('NAME', StringType()),
        StructField('DATE', DateType()),
    ]
    + [StructField(name, DoubleType()) for name in measurement_names]
    + [StructField(name, StringType()) for name in weather_type_names]
)

# Columns that are read (to match the file layout) but not used afterwards.
unused_columns = ["NAME", "TOBS", "WDF2", "WDF5", "WDMV", "WSF2", "WSF5"]

weather_data = spark.read.csv(WEATHER_DATA_PATH, header=True, schema=weather_data_schema)
weather_data = weather_data.drop(*unused_columns)

weather_data.show(n=5)

**Remove rows if any of the columns "AWND", "TAVG", "TMAX" and "TMIN" contain null. Afterwards, replace null in WT?? with 0 and cast the data type to boolean.**


In [None]:
stations_before = weather_data.select('STATION').distinct().count()
print("Number of stations before filtering: ", stations_before)

# Keep only rows in which all four core measurements are non-null.
for required_column in ("AWND", "TAVG", "TMAX", "TMIN"):
    weather_data = weather_data.filter(F.col(required_column).isNotNull())

stations_after = weather_data.select('STATION').distinct().count()
print("Number of stations after filtering: ", stations_after)

# Weather-type flags: a missing value is replaced by the string '0', then the
# column is cast from string to boolean in place.
flag_columns = [f"WT{code}" for code in ('01', "02", "03", "04", "05", "06", "08", "11")]
weather_data = weather_data.fillna('0', subset=flag_columns)
for flag_column in flag_columns:
    weather_data = weather_data.withColumn(flag_column, F.col(flag_column).cast(BooleanType()))

# Preview a few rows and the resulting schema.
weather_data.show(n=5)
weather_data.printSchema()

weather_record_count = weather_data.count()
print("Total number records: ", weather_record_count)

**To make my life easy, I select one of the three stations. Of course, one could use the aggregated values of the three stations, or map weather station to bike station.**

In [None]:
# Keep a single station; its identifier column is constant afterwards, so drop it.
is_selected_station = F.col("STATION") == "USW00093721"
weather_data = weather_data.where(is_selected_station).drop("STATION")

station_record_count = weather_data.count()
print("Total number records: ", station_record_count)

# There is no duplicated row: the number of distinct dates equals the row count.
distinct_date_count = weather_data.select("DATE").distinct().count()
assert distinct_date_count == weather_data.count()

In [None]:
# Preview the cleaned single-station weather data.
weather_data.show(n=5)

**Sanity check by plotting the temperature data and wind speed data.**

In [None]:
# Collect the weather data into pandas for plotting.
weather_df = weather_data.toPandas()

_, axes = plt.subplots(1, 2, figsize=(16, 4))
temperature_ax, wind_ax = axes

# Left: the three temperature series; right: average daily wind speed.
weather_df.plot(x="DATE", y=["TAVG", "TMIN", "TMAX"], ax=temperature_ax)
weather_df.plot(x="DATE", y="AWND", ax=wind_ax)

# Tilt the date labels on both panels.
for panel in (temperature_ax, wind_ax):
    panel.tick_params(axis='x', labelrotation=45)