This notebook performs the raw data transformation and saves the processed data to `data/raw`.

In [4]:
from pyspark.sql import SparkSession, functions as F

# Session-level options, applied one by one to the builder below.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",  # session time zone pinned to UTC
    'spark.driver.memory': '4g',
    'spark.executor.memory': '2g',
}

spark_builder = SparkSession.builder.appName("Duy Thinh raw data")
for conf_key, conf_value in spark_options.items():
    spark_builder = spark_builder.config(conf_key, conf_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = spark_builder.getOrCreate()

Read the schema from the landing data, cast each column to the appropriate data type, and convert all column names to lowercase.

In [5]:
# Load everything under the landing directory as a single DataFrame and keep
# its schema as the reference for the per-month casting done further down.
sdf_all = spark.read.parquet('../data/landing/*')
sdf_schema = sdf_all.schema

# Lookup table: lowercase column name -> Spark data type.
schema_dict = dict(
    (schema_field.name.lower(), schema_field.dataType)
    for schema_field in sdf_schema.fields
)
schema_dict

{'vendorid': IntegerType(),
 'tpep_pickup_datetime': TimestampNTZType(),
 'tpep_dropoff_datetime': TimestampNTZType(),
 'passenger_count': LongType(),
 'trip_distance': DoubleType(),
 'ratecodeid': LongType(),
 'store_and_fwd_flag': StringType(),
 'pulocationid': IntegerType(),
 'dolocationid': IntegerType(),
 'payment_type': LongType(),
 'fare_amount': DoubleType(),
 'extra': DoubleType(),
 'mta_tax': DoubleType(),
 'tip_amount': DoubleType(),
 'tolls_amount': DoubleType(),
 'improvement_surcharge': DoubleType(),
 'total_amount': DoubleType(),
 'congestion_surcharge': DoubleType(),
 'airport_fee': DoubleType()}

In [6]:
# Total number of rows across all files read from the landing directory.
sdf_all.count()

18816606

This part implements the logic described above.

In [7]:
# Normalise each monthly landing file (months 7-12 of 2023) to the reference
# schema held in `schema_dict`, then write the result to the raw layer.
for month in range(7, 13):
    month_tag = f'2023-{month:02d}'
    input_path = f'../data/landing/{month_tag}-yellow_cab.parquet'
    output_path = f'../data/raw/{month_tag}-yellow_cab'

    sdf_malformed = spark.read.parquet(input_path)
    lowered_names = [col_name.lower() for col_name in sdf_malformed.columns]

    # Validate BEFORE building any cast expression, so an unexpected column
    # fails loudly instead of being silently filtered out of the projection.
    for col_name in lowered_names:
        if col_name not in schema_dict:
            raise ValueError(f"Column '{col_name}' in the DataFrame is not present in the schema.")

    # A column that the reference schema expects but this month lacks would
    # otherwise produce an output whose schema differs from the other months.
    missing_names = [name for name in schema_dict if name not in lowered_names]
    if missing_names:
        raise ValueError(
            f"Columns {missing_names} from the schema are missing in '{input_path}'."
        )

    # Rename to lowercase and cast to the reference type in one projection.
    # The explicit alias keeps the output name independent of how Spark names
    # cast expressions.
    sdf_casted = sdf_malformed.select(*[
        F.col(original_name)
        .cast(schema_dict[original_name.lower()])
        .alias(original_name.lower())
        for original_name in sdf_malformed.columns
    ])

    # coalesce(1) collapses the data to a single partition before writing;
    # 'overwrite' makes re-running this cell replace the previous output.
    (
        sdf_casted
        .coalesce(1)
        .write
        .mode('overwrite')
        .parquet(output_path)
    )

                                                                                

In [8]:
# Read every output under the raw directory back as one DataFrame and preview
# the first 10 rows, truncating cell values longer than 100 characters.
raw_path = '../data/raw/*'
raw_sdf = spark.read.parquet(raw_path)
raw_sdf.show(n=10, truncate=100)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|vendorid|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|ratecodeid|store_and_fwd_flag|pulocationid|dolocationid|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2023-07-01 00:29:59|  2023-07-01 00:40:15|              1|          1.8|         1|                 N|         140|         263|           1|       12.1|  3.5|    0.5|       5.

In [9]:
# Print the schema of the re-read raw data to check the column names and types.
raw_sdf.printSchema()

root
 |-- vendorid: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecodeid: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- dolocationid: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

