## Create spark context

In [6]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
spark = SparkSession.builder.config("spark.sql.shuffle.partitions", "2").appName("Analysis").master("local[2]").getOrCreate()

# On yarn:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
# specify .master("yarn")

sc = spark.sparkContext

In [7]:
%run "../includes/configuration"

## Read the CSV file from HDFS & apply schema

In [10]:
drivers_df = spark.read.csv(f"{data}/drivers.csv", header = True)

In [11]:
drivers_df.show(5)

+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
|driverId| driverRef|number|code|forename|   surname|       dob|nationality|                 url|
+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   Lewis|  Hamilton|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|    \N| HEI|    Nick|  Heidfeld|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|    Nico|   Rosberg|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|Fernando|    Alonso|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|    \N| KOV|  Heikki|Kovalainen|1981-10-19|    Finnish|http://en.wikiped...|
+--------+----------+------+----+--------+----------+----------+-----------+--------------------+
only showing top 5 rows



## Rename columns  & concatenate forename & surname

In [16]:
drivers_with_columns_df = drivers_df.withColumnRenamed("driverId", "driver_id")\
.withColumnRenamed("driverRef", "driver_ref")\
.withColumn("name", concat(col("forename"), lit(" "), col("surname")))

In [17]:
drivers_with_columns_df.show(5)

+---------+----------+------+----+--------+----------+----------+-----------+--------------------+-----------------+
|driver_id|driver_ref|number|code|forename|   surname|       dob|nationality|                 url|             name|
+---------+----------+------+----+--------+----------+----------+-----------+--------------------+-----------------+
|        1|  hamilton|    44| HAM|   Lewis|  Hamilton|1985-01-07|    British|http://en.wikiped...|   Lewis Hamilton|
|        2|  heidfeld|    \N| HEI|    Nick|  Heidfeld|1977-05-10|     German|http://en.wikiped...|    Nick Heidfeld|
|        3|   rosberg|     6| ROS|    Nico|   Rosberg|1985-06-27|     German|http://en.wikiped...|     Nico Rosberg|
|        4|    alonso|    14| ALO|Fernando|    Alonso|1981-07-29|    Spanish|http://en.wikiped...|  Fernando Alonso|
|        5|kovalainen|    \N| KOV|  Heikki|Kovalainen|1981-10-19|    Finnish|http://en.wikiped...|Heikki Kovalainen|
+---------+----------+------+----+--------+----------+----------

## Drop unwanted columns

In [18]:
drivers_final_df = drivers_with_columns_df.drop(col("url"))

## Write outut to parquet file

In [19]:
drivers_final_df.write.mode('overwrite').parquet(f"{processed_folder_path}/drivers")