# Ingest lap_times folder

### Step 1 - Read the CSV files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql import functions as f

In [0]:
# Explicit schema for the lap_times CSV files: five integer columns and one
# string column. raceId and milliseconds are declared non-nullable; every
# other column is declared nullable.
lap_times_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), False)
)

In [0]:
# Load every CSV file under the lap_times folder into a single DataFrame,
# applying the explicit schema instead of letting Spark infer one.
df = spark.read.csv(
    "/FileStore/tables/lap_times/",
    schema=lap_times_schema,
)

In [0]:
# Preview the first five rows of the raw data.
df.show(n=5)

+------+--------+---+--------+--------+------------+
|raceId|driverId|lap|position|    time|milliseconds|
+------+--------+---+--------+--------+------------+
|   841|      20|  1|       1|1:38.109|       98109|
|   841|      20|  2|       1|1:33.006|       93006|
|   841|      20|  3|       1|1:32.713|       92713|
|   841|      20|  4|       1|1:32.803|       92803|
|   841|      20|  5|       1|1:32.342|       92342|
+------+--------+---+--------+--------+------------+
only showing top 5 rows



In [0]:
# Sanity check: total number of rows read from the lap_times folder.
df.count()

Out[11]: 490904

### Step 2 - Rename columns and add new columns

In [0]:
# Rename the camelCase id columns to snake_case and add an ingestion_date
# column holding the current timestamp.
transformed_df = (
    df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", f.current_timestamp())
)

In [0]:
# Preview the first five rows after the renames and the new column.
transformed_df.show(n=5)

+-------+---------+---+--------+--------+------------+--------------------+
|race_id|driver_id|lap|position|    time|milliseconds|      ingestion_date|
+-------+---------+---+--------+--------+------------+--------------------+
|    841|       20|  1|       1|1:38.109|       98109|2023-06-02 17:11:...|
|    841|       20|  2|       1|1:33.006|       93006|2023-06-02 17:11:...|
|    841|       20|  3|       1|1:32.713|       92713|2023-06-02 17:11:...|
|    841|       20|  4|       1|1:32.803|       92803|2023-06-02 17:11:...|
|    841|       20|  5|       1|1:32.342|       92342|2023-06-02 17:11:...|
+-------+---------+---+--------+--------+------------+--------------------+
only showing top 5 rows



### Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Persist the transformed data as parquet, replacing any output that already
# exists at the target path.
(
    transformed_df
    .write
    .parquet("/FileStore/tables/lap_times.parquet", mode="overwrite")
)