**The goal of this Jupyter Notebook is to:**

- Identify data quality issues, such as missing values and duplicate records.

- Formalize the steps to clean the datasets.

In [None]:
import os.path as osp

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DoubleType, LongType, StringType, StructField, StructType, TimestampType
)
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf

# !pip3 install matplotlib
import matplotlib.pyplot as plt

In [None]:
# Start a Spark session for this notebook, reusing one if it already exists.
# NOTE(review): the app name "Sparkify ETL" looks carried over from another
# project -- confirm it is intended.
spark = (
    SparkSession.builder
    .appName("Sparkify ETL")
    .getOrCreate()
)
spark

## Capital bikeshare trip data

![Capital bikeshare system map](./capital_bikeshare_system_map.png)

In [None]:
# Glob pattern matching the trip-data CSV files whose names start with "2020".
TRIP_DATA_PATH = "./datasets/capitalbikeshare_tripdata/2020*.csv"

In [None]:
# (column name, Spark type) pairs, listed in schema order.
_trip_columns = [
    ("ride_id", StringType()),
    ("rideable_type", StringType()),
    ("started_at", TimestampType()),
    ("ended_at", TimestampType()),
    ("start_station_name", StringType()),
    ("start_station_id", LongType()),
    ("end_station_name", StringType()),
    ("end_station_id", LongType()),
    ("start_lat", DoubleType()),
    ("start_lng", DoubleType()),
    ("end_lat", DoubleType()),
    ("end_lng", DoubleType()),
    ("member_casual", StringType()),
]
trip_data_schema = StructType(
    [StructField(name, dtype) for name, dtype in _trip_columns]
)

# Read the trip CSV files (each with a header row) using the explicit schema.
trip_data = (
    spark.read
    .schema(trip_data_schema)
    .option("header", True)
    .csv(TRIP_DATA_PATH)
)

# Quick look at the loaded data: sample rows, schema, and row count.
trip_data.show(5)

trip_data.printSchema()

print("Total number records: ", trip_data.count())

In [None]:
# Smallest and largest start_lng values, to help spot out-of-range coordinates.
trip_data.agg(F.min("start_lng"), F.max("start_lng")).collect()

In [None]:
# Smallest and largest end_lng values, to help spot out-of-range coordinates.
trip_data.agg(F.min("end_lng"), F.max("end_lng")).collect()

In [None]:
# Smallest and largest start_lat values, to help spot out-of-range coordinates.
trip_data.agg(F.min("start_lat"), F.max("start_lat")).collect()

In [None]:
# Smallest and largest end_lat values, to help spot out-of-range coordinates.
trip_data.agg(F.min("end_lat"), F.max("end_lat")).collect()

## COVID data by state

A description of the data fields can be found at https://covidtracking.com/data/api.

In [None]:
# Path to the daily COVID JSON file.
COVID_DATA_PATH = "./datasets/covid_data/daily.json"

# Fields kept from the raw records; everything else is discarded.
covid_columns = [
    "dataQualityGrade",
    "date",
    "state",
    "death",
    "deathIncrease",
    "hospitalizedCurrently",
    "hospitalizedDischarged",
    "hospitalizedIncrease",
    "positive",
    "positiveIncrease",
    "recovered",
]

raw_covid = spark.read.json(COVID_DATA_PATH)
covid_data = raw_covid.select(*covid_columns)

# Quick look at the loaded data: sample rows, schema, and row count.
covid_data.show(2)

covid_data.printSchema()

print("Total number records: ", covid_data.count())

## Weather data

- AWND: Average daily wind speed (miles per hour)
- TAVG: Average temperature (Fahrenheit)
- TMAX: Maximum temperature (Fahrenheit)
- TMIN: Minimum temperature (Fahrenheit)
- TOBS: Temperature at the time of observation (Fahrenheit)
- WDF2: Direction of fastest 2-minute wind (degrees)
- WDF5: Direction of fastest 5-second wind (degrees)
- WSF2: Fastest 2-minute wind speed (miles per hour)
- WSF5: Fastest 5-second wind speed (miles per hour)
- WDMV: 24-hour wind movement (miles)
- WT01: Fog, ice fog, or freezing fog (may include heavy fog)
- WT02: Heavy fog or heavy freezing fog (not always distinguished from fog)
- WT03: Thunder
- WT04: Ice pellets, sleet, snow pellets, or small hail
- WT05: Hail (may include small hail)
- WT06: Glaze or rime
- WT08: Smoke or haze
- WT11: High or damaging winds

In [None]:
# Glob pattern matching the daily weather CSV files.
WEATHER_DATA_PATH = "./datasets/weather_data/*_daily.csv"

# Columns removed immediately after loading.
unused_weather_columns = ("TOBS", "WDF2", "WDF5", "WSF2", "WSF5")

weather_raw = spark.read.csv(WEATHER_DATA_PATH, header=True)
weather_data = weather_raw.drop(*unused_weather_columns)

# Preview only rows that have a TAVG value.
weather_data.where(col("TAVG").isNotNull()).show(5)

weather_data.printSchema()