# Ingest circuits.csv file

In [0]:
# Show the usage help for the dbutils.widgets utility.
# NOTE(review): nothing later in this notebook consumes the result of this call;
# it looks like an exploratory leftover — consider removing it before scheduling the notebook.
dbutils.widgets.help()

In [0]:
# Declare the "Environment" dropdown widget and read its current selection.
# The selected value is later stamped onto every row as the `env` column.
dbutils.widgets.dropdown(
    "Environment",            # widget name
    "Dev",                    # default selection
    ["Prod", "Dev", "Test"],  # allowed choices
    "Environment",            # label shown next to the widget
)
env = dbutils.widgets.get("Environment")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema for circuits.csv, built one column at a time.
# Each .add(name, type, nullable) call appends one field, in file column order.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load circuits.csv from the raw folder, treating the first line as a header
# and applying the explicit schema instead of inferring column types.
circuits_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv(f"{raw_folder_path}/circuits.csv")
)

In [0]:
# Preview the first five raw rows as a plain-text table
# (display(circuits_df) is the alternative this cell previously used).
circuits_df.show(n=5)

+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|http://en.wikiped...|
|        3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|50.5106|  7|http://en.wikiped...|
|        4|  catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|2.26111|109|http://en.wikiped...|
|        5|   istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517| 29.405|130|http://en.wikiped...|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
only showing top 5 rows

### Step 2 - Select only required columns

In [0]:
from pyspark.sql.functions import col, lit

In [0]:
# Keep every column from the raw frame except `url`, which is not needed downstream.
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [0]:
# Preview the first five rows after column selection
# (display(circuits_selected_df) is the alternative this cell previously used).
circuits_selected_df.show(n=5)

+---------+-----------+--------------------+------------+---------+--------+-------+---+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|
|        3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|50.5106|  7|
|        4|  catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|2.26111|109|
|        5|   istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517| 29.405|130|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
only showing top 5 rows



### Step 3 - Rename the columns as required

In [0]:
# Convert the camelCase / abbreviated source column names to snake_case,
# then tag every row with the environment chosen in the widget.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("env", lit(env))  # same literal value on every row
)

In [0]:
# Preview the first five rows after renaming and adding the env column
# (display(circuits_renamed_df) is the alternative this cell previously used).
circuits_renamed_df.show(n=5)

+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+
|circuit_id|circuit_ref|                name|    location|  country|latitude|longitude|altitude| env|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+
|         1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|Prod|
|         2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|Prod|
|         3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|Prod|
|         4|  catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|Prod|
|         5|   istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|Prod|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+
only showing top 5 rows



### Step 4 - Add the ingestion date to the DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Add the ingestion timestamp through the shared add_ingestion_date helper instead of
# the earlier inline form: circuits_renamed_df.withColumn("ingestion_date", current_timestamp()).
# NOTE(review): add_ingestion_date is not defined in this notebook; presumably it is
# provided by the %run of ../includes/common_functions — confirm.
circuits_final_df = add_ingestion_date(circuits_renamed_df)

In [0]:
# Preview the first five rows of the final frame
# (display(circuits_final_df) is the alternative this cell previously used).
circuits_final_df.show(n=5)

+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+--------------------+
|circuit_id|circuit_ref|                name|    location|  country|latitude|longitude|altitude| env|      ingestion_date|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+--------------------+
|         1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|Prod|2023-06-02 20:56:...|
|         2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|Prod|2023-06-02 20:56:...|
|         3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|Prod|2023-06-02 20:56:...|
|         4|  catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|Prod|2023-06-02 20:56:...|
|         5|   istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|Prod|2023-06-02 20:56:...|
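+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+--------------------+
only showing top 5 rows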

### Step 5 - Write data to the data lake as Parquet

In [0]:
# Persist the final frame as Parquet under the processed folder.
# "overwrite" replaces whatever a previous run left at this path.
circuits_final_df.write.parquet(
    f"{processed_folder_path}/circuits.parquet",
    mode="overwrite",
)

In [0]:
# Read the Parquet output back so the written data can be inspected.
df = spark.read.format("parquet").load(f"{processed_folder_path}/circuits.parquet")

In [0]:
# Preview the first five rows read back from the Parquet output
# (display(df) is the alternative this cell previously used).
df.show(n=5)

+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+--------------------+
|circuit_id|circuit_ref|                name|    location|  country|latitude|longitude|altitude| env|      ingestion_date|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+--------------------+
|         1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|Prod|2023-06-02 20:56:...|
|         2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|Prod|2023-06-02 20:56:...|
|         3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|Prod|2023-06-02 20:56:...|
|         4|  catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|Prod|2023-06-02 20:56:...|
|         5|   istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|Prod|2023-06-02 20:56:...|
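+----------+-----------+--------------------+------------+---------+--------+---------+--------+----+--------------------+
only showing top 5 rows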