In [89]:
from pyspark.sql.functions import isnan, when, count, col
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, DoubleType

In [1]:
from pyspark.sql import SparkSession, functions as F

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 01:42:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [91]:
sdf = spark.read.parquet('../data/weather/weather_data.parquet')
df = pd.read_parquet('../data/weather/weather_data.parquet')
sdf.printSchema()
sdf.show(1, vertical=True, truncate=100)
# show last row
print(sdf.tail(1))
# check if there are any null values
pd.isnull(df).sum()
# count total records
print(df.shape)


# after looking this there is no need for any preprocessing in weather data
# in this case landing and raw data are same

root
 |-- STATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- AWND: double (nullable = true)
 |-- PGTM: double (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- SNOW: double (nullable = true)
 |-- TAVG: double (nullable = true)
 |-- TMAX: double (nullable = true)
 |-- TMIN: double (nullable = true)

-RECORD 0-----------------------------------
 STATION | USW00094789                      
 NAME    | JFK INTERNATIONAL AIRPORT, NY US 
 DATE    | 2022-01-01                       
 AWND    | 2.8                              
 PGTM    | null                             
 PRCP    | 31.0                             
 SNOW    | 0.0                              
 TAVG    | 10.1                             
 TMAX    | 11.7                             
 TMIN    | 8.9                              
only showing top 1 row

[Row(STATION='USW00094789', NAME='JFK INTERNATIONAL AIRPORT, NY US', DATE='2023-05-31', AWND=2.3, PGTM=None, PRC

## Lets begin some preprocessing on yellow taxi data

In [92]:
sdf = spark.read.parquet('../data/tlc/')
sdf_jan = spark.read.parquet('../data/tlc/2022-01.parquet')
sdf_jan.printSchema()
sdf_feb_23 = spark.read.parquet('../data/tlc/2023-02.parquet')
sdf_feb_23.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetim

In [93]:
# first before moving into raw layer data from 2022 have passenger count as double 
# lets cast all data to double and then save it to raw layer
def cast_to_schema(year, start, end, temp_schema):
    for month in range(start, end+1):
        input_path = f'../data/tlc/{year}-{str(month).zfill(2)}.parquet'
        output_path = f'../data/raw/{year}-{str(month).zfill(2)}.parquet'
        sdf_malformed = spark.read.parquet(input_path)
        sdf_malformed = sdf_malformed \
            .select([F.col(c).cast(temp_schema[i].dataType) for i, c in enumerate(sdf_malformed.columns)])
        sdf_malformed \
        .coalesce(1) \
        .write \
        .mode('overwrite') \
        .parquet(output_path)

In [94]:
# see if there is any non-whole number passenger_count. So it can be removed
# before any proper data conversion can be made. Lets cast whole data to 
# jan 2022 schema which contains passenger_count as double
tem_schema = sdf_jan.schema
tem_schema
cast_to_schema("2022", 1, 12, tem_schema)
cast_to_schema("2023", 1, 5, tem_schema)

                                                                                

In [95]:
# see if there is any non-whole number passenger_count
sdf = spark.read.parquet('../data/raw/*')
non_whole_count = sdf.filter(col("passenger_count") % 1 != 0).count()
print(f"Number of rows with non-whole number passenger counts: {non_whole_count}")



Number of rows with non-whole number passenger counts: 0


                                                                                

In [96]:
# well we are now sure there wasn't any non-whole number passenger_count
# so lets try to cast all datatype to 2023 february schema (proper schema)
sdf_feb_23.printSchema()

# also ensuring we have consistent casing
consistent_col_casing = [F.col(col_name).alias(col_name.lower()) for col_name in sdf_feb_23.columns]
sdf_feb_23 = sdf_feb_23.select(*consistent_col_casing)
sdf_schema = sdf_feb_23.schema
sdf_schema

cast_to_schema(2022, 1, 12, sdf_schema)
cast_to_schema(2023, 1, 5, sdf_schema)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



                                                                                

In [97]:
sdf = spark.read.parquet('../data/raw/*')
sdf.printSchema()
sdf.show(1, vertical=True, truncate=100)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2022-10-

## Done with raw data &#128512;