**The goal of this Jupyter Notebook is to:**

- Identify data quality issues, like missing values, duplicate data, etc.

- Formalize the steps to clean the datasets.

In [None]:
import os.path as osp

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DoubleType, LongType, StringType, StructField, StructType, TimestampType
)
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf

# !pip3 install matplotlib
import matplotlib.pyplot as plt

In [None]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Sparkify ETL")
    .getOrCreate()
)
# Keep the session as the last expression so the notebook renders it.
spark

## Capital Bikeshare trip data

![Capital bikeshare system map](./capital_bikeshare_system_map.png)

In [None]:
# Glob pattern for the trip CSV files: every file in the trip-data folder
# whose name starts with "2020".
TRIP_DATA_PATH = "./datasets/capitalbikeshare_tripdata/2020*.csv"

In [None]:
# Explicit schema for the trip CSVs so that timestamps, station ids and
# coordinates are parsed as typed columns instead of plain strings.
trip_data_schema = (
    StructType()
    .add('ride_id', StringType())
    .add('rideable_type', StringType())
    .add('started_at', TimestampType())
    .add('ended_at', TimestampType())
    .add('start_station_name', StringType())
    .add('start_station_id', LongType())
    .add('end_station_name', StringType())
    .add('end_station_id', LongType())
    .add('start_lat', DoubleType())
    .add('start_lng', DoubleType())
    .add('end_lat', DoubleType())
    .add('end_lng', DoubleType())
    .add('member_casual', StringType())
)

# NOTE(review): TRIP_DATA_PATH is a glob and may match several files; this
# assumes they all share the column layout declared above -- confirm.
trip_data = spark.read.csv(
    path=TRIP_DATA_PATH,
    schema=trip_data_schema,
    header=True,
)

# Quick visual check: a few rows, the resulting schema, and the row count.
trip_data.show(n=5)

trip_data.printSchema()

print("Total number records: ", trip_data.count())

In [None]:
# Range check: smallest and largest start longitude across all trips.
trip_data.agg(F.min("start_lng"), F.max("start_lng")).collect()

In [None]:
# Range check: smallest and largest end longitude across all trips.
trip_data.agg(F.min("end_lng"), F.max("end_lng")).collect()

In [None]:
# Range check: smallest and largest start latitude across all trips.
trip_data.agg(F.min("start_lat"), F.max("start_lat")).collect()

In [None]:
# Range check: smallest and largest end latitude across all trips.
trip_data.agg(F.min("end_lat"), F.max("end_lat")).collect()

## COVID data by state

In [None]:
# Location of the COVID CSV file (daily.csv) that is loaded into `covid_data`.
COVID_DATA_PATH = "./datasets/covid_data/daily.csv"

In [None]:
# Deprecated fields of the COVID dataset; they are removed immediately after
# loading so the rest of the notebook never sees them.
deprecated_columns = [
    "checkTimeEt",
    "commercialScore",
    "dateChecked",
    "dateModified",
    "grade",
    "hash",
    "hospitalized",
    "negativeIncrease",
    "negativeRegularScore",
    "negativeScore",
    "posNeg",
    "positiveScore",
    "score",
    "total",
]

# No schema is supplied here, only the header row is used for column names.
covid_data = spark.read.csv(COVID_DATA_PATH, header=True).drop(*deprecated_columns)

covid_data.printSchema()

print("Total number records: ", covid_data.count())

## Weather data

In [None]:
from datetime import datetime, date
# !pip3 install pandas
import pandas as pd
from pyspark.sql import Row

In [None]:
# Example: build a DataFrame from Row objects without passing a schema,
# leaving the column types to be inferred from the values.
inferred_rows = [
    Row(a=1, b=2, c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3, c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=3, b=5, c='string3', d=date(2000, 2, 1), e=datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(inferred_rows)
df

In [None]:
# Example: same kind of rows, but with the column types spelled out in a
# DDL-style schema string instead of being inferred.
typed_rows = [
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=3, b=5., c='string3', d=date(2000, 2, 1), e=datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(
    data=typed_rows,
    schema='a long, b double, c string, d date, e timestamp',
)
df

In [None]:
# Example: create the data as a pandas DataFrame first, then hand it to Spark.
pandas_columns = {
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)],
}
pandas_df = pd.DataFrame(pandas_columns)

# Convert the pandas frame into a Spark DataFrame.
df = spark.createDataFrame(pandas_df)
df

In [None]:
# Example: start from an RDD of plain tuples and attach column names when
# converting it to a DataFrame.
tuple_records = [
    (1, 2., 'a', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'b', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 5., 'c', date(2000, 2, 1), datetime(2000, 1, 3, 12, 0)),
]
rdd = spark.sparkContext.parallelize(tuple_records)

# Only names are given here; the column types are left to Spark.
column_names = ['a', 'b', 'c', 'd', 'e']
df = spark.createDataFrame(rdd, schema=column_names)
df

In [None]:
# Print the rows of the most recently created `df` as a text table.
df.show()

In [None]:
# Print the column names and types of the most recently created `df`.
df.printSchema()