# Ingest Circuits file

In [0]:
dbutils.widgets.help()

In [0]:
# this widget would help us to ingest data from various sources and process it using the same notebook
dbutils.widgets.text("p_data_source","")
# we have to pass in a value at the top text box
v_data_source = dbutils.widgets.get("p_data_source")
# add it as a column while creating the parquet file

In [0]:
%run "/databricks-course/Formula 1/includes/configurations.ipynb"

In [0]:
%run "/databricks-course/Formula 1/includes/common_functions"

In [0]:
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/Volumes,UnityCatalogVolumes,
/mnt/formula1dlvb/presentation,abfss://presentation@formula1dlvb.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/mnt/formula1dl/demo,abfss://demo@formula1dlvb.dfs.core.windows.net/,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/formula1dlvb/raw,abfss://raw@formula1dlvb.dfs.core.windows.net/,
/mnt/formula1dlvb/processed,abfss://processed@formula1dlvb.dfs.core.windows.net/,
/Volume,DbfsReserved,


In [0]:
%fs
ls /mnt/formula1dlvb/raw

path,name,size,modificationTime
dbfs:/mnt/formula1dlvb/raw/circuits.csv,circuits.csv,10044,1696942444000
dbfs:/mnt/formula1dlvb/raw/constructors.json,constructors.json,30415,1696942444000
dbfs:/mnt/formula1dlvb/raw/drivers.json,drivers.json,180812,1696942444000
dbfs:/mnt/formula1dlvb/raw/lap_times/,lap_times/,0,1696942528000
dbfs:/mnt/formula1dlvb/raw/pit_stops.json,pit_stops.json,1369387,1696942444000
dbfs:/mnt/formula1dlvb/raw/qualifying/,qualifying/,0,1696942529000
dbfs:/mnt/formula1dlvb/raw/races.csv,races.csv,116847,1696942444000
dbfs:/mnt/formula1dlvb/raw/results.json,results.json,7165641,1696942445000


In [0]:
circuits_df = spark.read.option("header",True).option("inferSchema",True).csv(f"dbfs:{raw_folder_path}/circuits.csv")

# inferSchema goes through the data, identify what the schema is and apply it to the data.The complete data is read and this is not suitable in production environment and it can slow down reads.

In [0]:
type(circuits_df)

pyspark.sql.dataframe.DataFrame

In [0]:
display(circuits_df)

circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Villeneuve
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers_Magny-Cours
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,http://en.wikipedia.org/wiki/Silverstone_Circuit
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,http://en.wikipedia.org/wiki/Hockenheimring


In [0]:
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



In [0]:
display(circuits_df.describe())

summary,circuitId,circuitRef,name,location,country,lat,lng,alt,url
count,77.0,77,77,77,77,77.0,77.0,77.0,77
mean,39.0,,,,,33.72035103896102,3.551302597402597,247.4935064935065,
stddev,22.37185732119709,,,,,22.88596900007453,64.8766790440326,363.2672505910991,
min,1.0,BAK,A1-Ring,Abu Dhabi,Argentina,-37.8497,-118.189,-7.0,http://en.wikipedia.org/wiki/A1-Ring
max,77.0,zolder,Zolder,Zandvoort,Vietnam,57.2653,144.968,2227.0,http://en.wikipedia.org/wiki/Zolder


**Schema Definition**

In [0]:
# struct_type -> row
# struct_field -> column

from pyspark.sql.types import StructType,StructField,IntegerType, StringType, DoubleType

In [0]:
circuits_schema = StructType(fields = [
    StructField("circuitId",IntegerType(),False),
    StructField("circuitRef",StringType(),True),
    StructField("name",StringType(),True),
    StructField("location",StringType(),True),
    StructField("country",StringType(),True),
    StructField("lat",StringType(),True),
    StructField("lng",StringType(),True),
    StructField("alt",IntegerType(),True),
    StructField("url",StringType(),True)
])

In [0]:
circuits_df = spark.read.option("header",True).schema(circuits_schema).csv("dbfs:/mnt/formula1dlvb/raw/circuits.csv")

In [0]:
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)




**Selecting Only Required Columns**

In [0]:
circuits_selected = circuits_df.select("circuitId","circuitRef","name","location","lat","lng","alt")

In [0]:
from pyspark.sql.functions import col

In [0]:
circuits_selected = circuits_df.select(col("circuitId"),col("circuitRef"),col("name"),col("location").alias("race_location"),col("lat"),col("lng"),col("alt"))

In [0]:
display(circuits_selected)

circuitId,circuitRef,name,race_location,lat,lng,alt
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,40.9517,29.405,130
6,monaco,Circuit de Monaco,Monte-Carlo,43.7347,7.42056,7
7,villeneuve,Circuit Gilles Villeneuve,Montreal,45.5,-73.5228,13
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,46.8642,3.16361,228
9,silverstone,Silverstone Circuit,Silverstone,52.0786,-1.01694,153
10,hockenheimring,Hockenheimring,Hockenheim,49.3278,8.56583,103


**Renaming Columns**

In [0]:
from pyspark.sql.functions import lit

In [0]:
circuits_renamed_df = circuits_selected\
.withColumnRenamed("circuitId","circuit_id")\
.withColumnRenamed("lat","latitude")\
.withColumnRenamed("lng","longitude")\
.withColumnRenamed("alt","altitude")\
.withColumn("data_source",lit(v_data_source)) # convert it to column type and add it. lit helps us do this


In [0]:
display(circuits_renamed_df)

circuit_id,circuitRef,name,race_location,latitude,longitude,altitude,data_source
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,-37.8497,144.968,10,testing
2,sepang,Sepang International Circuit,Kuala Lumpur,2.76083,101.738,18,testing
3,bahrain,Bahrain International Circuit,Sakhir,26.0325,50.5106,7,testing
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,41.57,2.26111,109,testing
5,istanbul,Istanbul Park,Istanbul,40.9517,29.405,130,testing
6,monaco,Circuit de Monaco,Monte-Carlo,43.7347,7.42056,7,testing
7,villeneuve,Circuit Gilles Villeneuve,Montreal,45.5,-73.5228,13,testing
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,46.8642,3.16361,228,testing
9,silverstone,Silverstone Circuit,Silverstone,52.0786,-1.01694,153,testing
10,hockenheimring,Hockenheimring,Hockenheim,49.3278,8.56583,103,testing


### Add Columns

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
circuits_final_df = add_ingestion_date(circuits_renamed_df).withColumn("ingestion_date",current_timestamp()).withColumn("env",lit("Production"))

In [0]:
display(circuits_final_df)

circuit_id,circuitRef,name,race_location,latitude,longitude,altitude,ingestion_date,env
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,-37.8497,144.968,10,2023-10-17T18:07:32.261+0000,Production
2,sepang,Sepang International Circuit,Kuala Lumpur,2.76083,101.738,18,2023-10-17T18:07:32.261+0000,Production
3,bahrain,Bahrain International Circuit,Sakhir,26.0325,50.5106,7,2023-10-17T18:07:32.261+0000,Production
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,41.57,2.26111,109,2023-10-17T18:07:32.261+0000,Production
5,istanbul,Istanbul Park,Istanbul,40.9517,29.405,130,2023-10-17T18:07:32.261+0000,Production
6,monaco,Circuit de Monaco,Monte-Carlo,43.7347,7.42056,7,2023-10-17T18:07:32.261+0000,Production
7,villeneuve,Circuit Gilles Villeneuve,Montreal,45.5,-73.5228,13,2023-10-17T18:07:32.261+0000,Production
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,46.8642,3.16361,228,2023-10-17T18:07:32.261+0000,Production
9,silverstone,Silverstone Circuit,Silverstone,52.0786,-1.01694,153,2023-10-17T18:07:32.261+0000,Production
10,hockenheimring,Hockenheimring,Hockenheim,49.3278,8.56583,103,2023-10-17T18:07:32.261+0000,Production


### Write to Parquet file

In [0]:
circuits_final_df.write.mode("overwrite").parquet(f"dbfs:{processed_folder_path}/circuits")

In [0]:
%fs
ls "dbfs:/mnt/formula1dlvb/processed/circuits"

path,name,size,modificationTime
dbfs:/mnt/formula1dlvb/processed/circuits/_SUCCESS,_SUCCESS,0,1697386527000
dbfs:/mnt/formula1dlvb/processed/circuits/_committed_2002322418279626914,_committed_2002322418279626914,123,1697386349000
dbfs:/mnt/formula1dlvb/processed/circuits/_committed_2702703524669758258,_committed_2702703524669758258,232,1697386525000
dbfs:/mnt/formula1dlvb/processed/circuits/_started_2002322418279626914,_started_2002322418279626914,0,1697386345000
dbfs:/mnt/formula1dlvb/processed/circuits/_started_2702703524669758258,_started_2702703524669758258,0,1697386524000
dbfs:/mnt/formula1dlvb/processed/circuits/part-00000-tid-2702703524669758258-a3876e0c-f09b-4a9d-ab8e-ce089c5e46e2-15-1-c000.snappy.parquet,part-00000-tid-2702703524669758258-a3876e0c-f09b-4a9d-ab8e-ce089c5e46e2-15-1-c000.snappy.parquet,7528,1697386525000


In [0]:
df = spark.read.parquet("dbfs:/mnt/formula1dlvb/processed/circuits")

In [0]:
display(df)

circuit_id,circuitRef,name,race_location,latitude,longitude,altitude,ingestion_date,env
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,-37.8497,144.968,10,2023-10-15T16:15:22.942+0000,Production
2,sepang,Sepang International Circuit,Kuala Lumpur,2.76083,101.738,18,2023-10-15T16:15:22.942+0000,Production
3,bahrain,Bahrain International Circuit,Sakhir,26.0325,50.5106,7,2023-10-15T16:15:22.942+0000,Production
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,41.57,2.26111,109,2023-10-15T16:15:22.942+0000,Production
5,istanbul,Istanbul Park,Istanbul,40.9517,29.405,130,2023-10-15T16:15:22.942+0000,Production
6,monaco,Circuit de Monaco,Monte-Carlo,43.7347,7.42056,7,2023-10-15T16:15:22.942+0000,Production
7,villeneuve,Circuit Gilles Villeneuve,Montreal,45.5,-73.5228,13,2023-10-15T16:15:22.942+0000,Production
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,46.8642,3.16361,228,2023-10-15T16:15:22.942+0000,Production
9,silverstone,Silverstone Circuit,Silverstone,52.0786,-1.01694,153,2023-10-15T16:15:22.942+0000,Production
10,hockenheimring,Hockenheimring,Hockenheim,49.3278,8.56583,103,2023-10-15T16:15:22.942+0000,Production
