## Create spark context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
spark = SparkSession.builder.config("spark.sql.shuffle.partitions", "2").appName("Analysis").master("local[2]").getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext



In [2]:
%run "../includes/configuration"

## Define schema

In [3]:
pit_stops_schema = StructType([
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), True),
    StructField("stop", StringType(), True),
    StructField("lap", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("duration", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
])

## Read the CSV file from HDFS & apply schema

In [4]:
# It is a multiLine json file, so we need to specify option "multiline"
pit_stops_df = spark.read.schema(pit_stops_schema).csv(f"{data}/pit_stops.csv")

In [5]:
pit_stops_df.show(5)

+------+--------+----+----+--------+--------+------------+
|raceId|driverId|stop| lap|    time|duration|milliseconds|
+------+--------+----+----+--------+--------+------------+
|  null|    null|stop|null|    time|duration|        null|
|   841|     153|   1|   1|17:05:23|  26.898|       26898|
|   841|      30|   1|   1|17:05:52|  25.021|       25021|
|   841|      17|   1|  11|17:20:48|  23.426|       23426|
|   841|       4|   1|  12|17:22:34|  23.251|       23251|
+------+--------+----+----+--------+--------+------------+
only showing top 5 rows



## Rename columns

In [6]:
final_df = pit_stops_df.withColumnRenamed("driverId", "driver_id").withColumnRenamed("raceId", "race_id")

## Write outut to parquet file

In [7]:
final_df.write.mode('overwrite').parquet(f"{processed_folder_path}/pit_stops")