## Create Spark session and context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) a local two-core session; shuffles are limited to two
# partitions for this job.
spark = (
    SparkSession.builder
    .appName("Analysis")
    .master("local[2]")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

# To run on YARN instead, build the session with .master("yarn")
# (plus .enableHiveSupport() when Hive support is needed), e.g.:
#   SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext



In [2]:
# Execute the shared configuration file so the names it defines become available here.
# NOTE(review): presumably this is where `data` and `processed_folder_path`
# (used by later cells) come from — confirm against ../includes/configuration.
%run "../includes/configuration"

## Define schema

In [3]:
# DDL-style schema string for the constructors file, one field per line.
constructors_schema = ", ".join([
    "constructorId INT",
    "constructorRef STRING",
    "name STRING",
    "nationality STRING",
    "url STRING",
])

## Read the CSV file from HDFS & apply schema

In [4]:
# Load constructors.csv with the explicit schema; the first line of the file
# is treated as a header row.
constructor_df = (
    spark.read
    .schema(constructors_schema)
    .csv(f"{data}/constructors.csv", header=True)
)

In [5]:
# Preview the first five rows of the freshly loaded DataFrame.
constructor_df.show(n=5)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|  Williams|    British|http://en.wikiped...|
|            4|       renault|   Renault|     French|http://en.wikiped...|
|            5|    toro_rosso|Toro Rosso|    Italian|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 5 rows



## Drop unwanted columns from the dataframe

In [6]:
# Remove the url column by name; every other column is kept as-is.
constructor_dropped_df = constructor_df.drop("url")

## Rename columns

In [7]:
# Convert the camelCase identifier columns to snake_case.
constructor_final_df = (
    constructor_dropped_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
)

In [8]:
# Preview the first five rows after the drop and renames.
constructor_final_df.show(n=5)

+--------------+---------------+----------+-----------+
|constructor_id|constructor_ref|      name|nationality|
+--------------+---------------+----------+-----------+
|             1|        mclaren|   McLaren|    British|
|             2|     bmw_sauber|BMW Sauber|     German|
|             3|       williams|  Williams|    British|
|             4|        renault|   Renault|     French|
|             5|     toro_rosso|Toro Rosso|    Italian|
+--------------+---------------+----------+-----------+
only showing top 5 rows



## Write output to Parquet file

In [9]:
# Persist the result as Parquet under the processed folder, replacing any
# output already present at that path.
(
    constructor_final_df.write
    .mode("overwrite")
    .parquet(f"{processed_folder_path}/constructors")
)