## Ingest Multiple CSV Files from the Lap-Times Folder

##### Step 01 - Read the CSV files using an explicit schema

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap-times CSV files: one field per column, in file order.
# Only raceId is declared non-nullable.
lap_times_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), nullable=False),
        StructField("driverId", IntegerType(), nullable=True),
        StructField("lap", IntegerType(), nullable=True),
        StructField("position", IntegerType(), nullable=True),
        StructField("time", StringType(), nullable=True),
        StructField("milliseconds", IntegerType(), nullable=True),
    ]
)

In [0]:
# Read the raw lap_times folder as CSV, applying the explicit schema
# instead of letting Spark infer column types.
lap_times_df = (
    spark.read
    .schema(lap_times_schema)
    .csv('/mnt/formula1dlbyumar136/raw/lap_times')
)

In [0]:
# Preview the raw lap-times DataFrame to confirm the schema was applied as expected.
display(lap_times_df)

raceId,driverId,lap,position,time,milliseconds
841,20,1,1,1:38.109,98109
841,20,2,1,1:33.006,93006
841,20,3,1,1:32.713,92713
841,20,4,1,1:32.803,92803
841,20,5,1,1:32.342,92342
841,20,6,1,1:32.605,92605
841,20,7,1,1:32.502,92502
841,20,8,1,1:32.537,92537
841,20,9,1,1:33.240,93240
841,20,10,1,1:32.572,92572


##### Step 02 - Rename columns and add the ingestion date

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Rename raceId to snake_case and stamp every row with the ingestion timestamp.
# lap_times_schema defines no "resultId" column, so no rename is issued for it:
# withColumnRenamed silently does nothing when the source column is absent.
# NOTE(review): driverId is left in camelCase here; renaming it to driver_id would
# change the schema of the processed Parquet output, so confirm with downstream
# readers before doing so.
add_columns_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [0]:
# Sanity check: number of rows loaded into the raw lap-times DataFrame.
lap_times_df.count()

In [0]:
# Preview the DataFrame after the rename and the added ingestion_date column.
display(add_columns_df)

race_id,driverId,lap,position,time,milliseconds,ingestion_date
841,20,1,1,1:38.109,98109,2022-09-16T14:17:01.006+0000
841,20,2,1,1:33.006,93006,2022-09-16T14:17:01.006+0000
841,20,3,1,1:32.713,92713,2022-09-16T14:17:01.006+0000
841,20,4,1,1:32.803,92803,2022-09-16T14:17:01.006+0000
841,20,5,1,1:32.342,92342,2022-09-16T14:17:01.006+0000
841,20,6,1,1:32.605,92605,2022-09-16T14:17:01.006+0000
841,20,7,1,1:32.502,92502,2022-09-16T14:17:01.006+0000
841,20,8,1,1:32.537,92537,2022-09-16T14:17:01.006+0000
841,20,9,1,1:33.240,93240,2022-09-16T14:17:01.006+0000
841,20,10,1,1:32.572,92572,2022-09-16T14:17:01.006+0000


##### Step 03 - Save into Processed Container in Parquet format

In [0]:
# Write the transformed lap times as Parquet to the processed location,
# overwriting whatever an earlier run left at that path.
(
    add_columns_df.write
    .mode("overwrite")
    .parquet("/mnt/formula1dlbyumar136/processed/lap_times")
)

In [0]:
# Read the Parquet output back and display it to verify the write.
display(
    spark.read.parquet("/mnt/formula1dlbyumar136/processed/lap_times")
)

race_id,driverId,lap,position,time,milliseconds,ingestion_date
145,2,39,7,1:27.597,87597,2022-09-16T14:17:58.552+0000
145,2,40,10,1:47.283,107283,2022-09-16T14:17:58.552+0000
145,2,41,10,1:25.187,85187,2022-09-16T14:17:58.552+0000
145,2,42,10,1:23.978,83978,2022-09-16T14:17:58.552+0000
145,2,43,10,1:24.152,84152,2022-09-16T14:17:58.552+0000
145,2,44,9,1:24.952,84952,2022-09-16T14:17:58.552+0000
145,2,45,8,1:24.888,84888,2022-09-16T14:17:58.552+0000
145,2,46,7,1:24.921,84921,2022-09-16T14:17:58.552+0000
145,2,47,7,1:24.027,84027,2022-09-16T14:17:58.552+0000
145,2,48,7,1:24.120,84120,2022-09-16T14:17:58.552+0000
