## Create spark context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) a local Spark session named "Analysis".
# master("local[2]") runs Spark in-process with two worker threads, and
# spark.sql.shuffle.partitions is lowered to 2 to match that small footprint.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "2")
    .appName("Analysis")
    .master("local[2]")
    .getOrCreate()
)

# Alternative for a YARN cluster (note the .master("yarn") call):
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext



In [2]:
# Execute the shared configuration file into this notebook's namespace.
# NOTE(review): later cells use `data` and `processed_folder_path`, which are
# not defined in this notebook — presumably they come from this include; confirm.
%run "../includes/configuration"

## Define schema

In [3]:
# Explicit schema for the lap-times file: one row per (race, driver, lap).
# Only raceId is declared non-nullable; every other column may be null.
lap_times_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

## Read the CSV file from HDFS & apply schema

In [4]:
# Load the lap-times CSV using the explicit schema instead of schema inference.
# NOTE(review): no `header` option is set, so the first line of the file is
# parsed as data — confirm lap_times.csv has no header row.
# NOTE(review): `data` is not defined in the visible cells; presumably it is
# provided by the configuration include — confirm.
lap_times_df = spark.read.csv(
    f"{data}/lap_times.csv",
    schema=lap_times_schema,
)

## Rename columns & add ingestion date

In [5]:
# Rename the camelCase id columns to snake_case and stamp every row with the
# time of this ingestion run. The section heading calls for an ingestion date,
# which the original chain did not add.
final_df = (
    lap_times_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumn("ingestion_date", current_timestamp())
)

## Write output to parquet file

In [6]:
# Persist the result as Parquet under the processed folder; "overwrite"
# replaces any output left at that path by a previous run.
final_df.write.parquet(
    f"{processed_folder_path}/lap_times",
    mode="overwrite",
)