# Ingest constructors.json file

In [0]:
# Dropdown widget named "Environment" offering Prod / Dev / Test,
# with "Dev" pre-selected.
dbutils.widgets.dropdown(
    "Environment",
    "Dev",
    ["Prod", "Dev", "Test"],
    "Environment",
)
# Read back whichever environment is currently selected.
env = dbutils.widgets.get("Environment")

In [0]:
# Text widget holding the file date to process; defaults to "2021-03-21".
dbutils.widgets.text(
    "p_file_date",
    "2021-03-21",
)
# Current widget value, kept as a string.
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the JSON file using the spark dataframe reader

In [0]:
# DDL-style schema for constructors.json, one column per line.
# The adjacent literals concatenate into a single schema string.
constructors_schema = (
    "constructorId INT, "
    "constructorRef STRING, "
    "name STRING, "
    "nationality STRING, "
    "url STRING"
)

In [0]:
# Read constructors.json from the raw folder for the selected file date,
# applying the explicit schema rather than letting Spark infer one.
constructor_df = (
    spark.read
    .schema(constructors_schema)
    .json(f"{raw_folder_path}/{v_file_date}/constructors.json")
)

In [0]:
# Plain-text preview of the first five rows as read from the JSON file.
# (display(constructor_df) is the alternative that was previously used here.)
constructor_df.show(n=5)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|  Williams|    British|http://en.wikiped...|
|            4|       renault|   Renault|     French|http://en.wikiped...|
|            5|    toro_rosso|Toro Rosso|    Italian|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 5 rows



### Step 2 - Drop unwanted columns

In [0]:
# The url column is not carried forward to the later steps.
constructor_dropped_df = constructor_df.drop("url")

### Step 3 - Rename columns and add environment, ingestion date and file date

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# Rename the camelCase columns to snake_case, then tag every row with the
# selected environment, the ingestion timestamp and the source file date.
constructor_final_df = (
    constructor_dropped_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("env", lit(env))
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Plain-text preview of the first five rows after renaming and enrichment.
# (display(constructor_final_df) is the alternative that was previously used here.)
constructor_final_df.show(n=5)

+--------------+---------------+----------+-----------+----+--------------------+----------+
|constructor_id|constructor_ref|      name|nationality| env|      ingestion_date| file_date|
+--------------+---------------+----------+-----------+----+--------------------+----------+
|             1|        mclaren|   McLaren|    British|Prod|2023-06-17 16:35:...|2021-03-21|
|             2|     bmw_sauber|BMW Sauber|     German|Prod|2023-06-17 16:35:...|2021-03-21|
|             3|       williams|  Williams|    British|Prod|2023-06-17 16:35:...|2021-03-21|
|             4|        renault|   Renault|     French|Prod|2023-06-17 16:35:...|2021-03-21|
|             5|     toro_rosso|Toro Rosso|    Italian|Prod|2023-06-17 16:35:...|2021-03-21|
+--------------+---------------+----------+-----------+----+--------------------+----------+
only showing top 5 rows



### Step 4 - Write output to a Delta table

In [0]:
# Recursively remove whatever currently exists at this DBFS location before
# writing.
# NOTE(review): presumably this is the storage location backing
# f1_processed.constructors - confirm. The path is hard-coded rather than
# derived from the shared configuration.
dbutils.fs.rm("dbfs:/FileStore/tables/processed/constructors", True)

# Save the final DataFrame as table f1_processed.constructors in Delta format,
# overwriting any existing table contents.
(
    constructor_final_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("f1_processed.constructors")
)

In [0]:
# End the notebook run, handing "Success" back as its exit value.
dbutils.notebook.exit("Success")

Success