## Create Spark session and context

In [24]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Local session: two worker threads (local[2]) and two shuffle partitions.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "2")
    .appName("Analysis")
    .master("local[2]")
    .getOrCreate()
)

# Alternative for a YARN cluster (note .master("yarn") and Hive support):
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [25]:
# Execute the shared configuration file located one directory up, under includes/.
# NOTE(review): none of the cells visible in this part of the notebook reference a name
# defined by it — confirm it is still required.
%run "../includes/configuration"

## Read files

In [26]:
# Directory holding the cleaned CSV exports; edit this one line to relocate the data.
CLEAN_CSV_DIR = r"/home/sunbeam/Desktop/FastLaneForecast/csv_transformations/clean_csv"

# Races: the generic `circuit_id` and `name` columns are renamed so they stay
# unambiguous once this frame is joined with the circuits frame.
# No schema or inferSchema option is passed, so every column is loaded as a string.
races_df = (
    spark.read.csv(CLEAN_CSV_DIR + "/races_clean.csv", header=True)
    .withColumnRenamed("circuit_id", "races_circuit_id")
    .withColumnRenamed("name", "race_name")
)

# Circuits: only `name` is renamed (to `circuit_name`); columns are likewise strings.
circuits_df = (
    spark.read.csv(CLEAN_CSV_DIR + "/circuits_clean.csv", header=True)
    .withColumnRenamed("name", "circuit_name")
)

In [27]:
# Column names of the races frame, after the renames applied when it was loaded.
races_df.columns

['race_id', 'round', 'races_circuit_id', 'race_name', 'race_year']

In [28]:
# Column names of the circuits frame, after `name` was renamed to `circuit_name`.
circuits_df.columns

['circuit_id',
 'circuit_ref',
 'circuit_name',
 'location',
 'country',
 'latitude',
 'longitude',
 'altitude']

## Most popular circuit of all time

In [29]:
# Pair each race with the circuit it was held on: inner join, circuit id on both sides.
circuits_races_df = circuits_df.join(
    races_df,
    on=circuits_df["circuit_id"] == races_df["races_circuit_id"],
    how="inner",
)

In [50]:
# Column names of the joined frame: the circuit columns come first, then the race columns.
circuits_races_df.columns

['circuit_id',
 'circuit_ref',
 'circuit_name',
 'location',
 'country',
 'latitude',
 'longitude',
 'altitude',
 'race_id',
 'round',
 'races_circuit_id',
 'race_name',
 'race_year']

In [31]:
# Expose the joined frame to Spark SQL under the table name the queries below select from.
circuits_races_df.createOrReplaceTempView("circuits_races_table")

In [51]:
# Number of races held at each (country, circuit_name) pair, highest count first.
# NOTE: the sort is on races_count only, so circuits with equal counts come back in
# no guaranteed relative order.
query = """
SELECT
    country,
    circuit_name,
    COUNT(race_id) AS races_count
FROM circuits_races_table
GROUP BY country, circuit_name
ORDER BY races_count DESC
"""

In [52]:
# Run the SQL currently held in `query` (the name is reassigned further down, so the
# result depends on which assignment ran last). show() with no arguments prints at
# most 20 rows and shortens long strings, hence the "..." in circuit_name.
spark.sql(query).show()

+------------+--------------------+-----------+
|     country|        circuit_name|races_count|
+------------+--------------------+-----------+
|       Italy|Autodromo Naziona...|         71|
|      Monaco|   Circuit de Monaco|         67|
|          UK| Silverstone Circuit|         56|
|     Belgium|Circuit de Spa-Fr...|         54|
|     Germany|         Nürburgring|         42|
|      Canada|Circuit Gilles Vi...|         41|
|      Brazil|Autódromo José Ca...|         38|
|     Germany|      Hockenheimring|         37|
|     Hungary|         Hungaroring|         36|
|       Japan|      Suzuka Circuit|         32|
|       Spain|Circuit de Barcel...|         31|
| Netherlands|Circuit Park Zand...|         31|
|       Italy|Autodromo Enzo e ...|         29|
|     Austria|             A1-Ring|         25|
|   Australia|Albert Park Grand...|         25|
|      Mexico|Autódromo Hermano...|         21|
|         USA|        Watkins Glen|         20|
|   Argentina|Autódromo Juan y ...|     

## Most popular circuit of the last decade (2011–2020)

In [53]:
# Races per (country, circuit_name) for the 2011-2020 seasons, both years inclusive,
# highest count first.
# race_year comes from a CSV that is read without a schema, so it is a string column;
# the CAST makes the numeric comparison explicit instead of leaning on Spark's
# implicit string-to-number coercion.
# NOTE: the sort is on races_count only, so circuits with equal counts come back in
# no guaranteed relative order.
query = """
SELECT
    country,
    circuit_name,
    COUNT(race_id) AS races_count
FROM circuits_races_table
WHERE CAST(race_year AS INT) BETWEEN 2011 AND 2020
GROUP BY country, circuit_name
ORDER BY races_count DESC
"""

In [54]:
# Run the SQL currently held in `query` (the same name is assigned more than once in
# this notebook, so the result depends on which assignment ran last). show() with no
# arguments prints at most 20 rows and shortens long strings, hence the "..." in circuit_name.
spark.sql(query).show()

+---------+--------------------+-----------+
|  country|        circuit_name|races_count|
+---------+--------------------+-----------+
|       UK| Silverstone Circuit|         11|
|  Belgium|Circuit de Spa-Fr...|         10|
|  Hungary|         Hungaroring|         10|
|      UAE|  Yas Marina Circuit|         10|
|  Bahrain|Bahrain Internati...|         10|
|    Italy|Autodromo Naziona...|         10|
|    Spain|Circuit de Barcel...|         10|
|Australia|Albert Park Grand...|          9|
|Singapore|Marina Bay Street...|          9|
|   Canada|Circuit Gilles Vi...|          9|
|    China|Shanghai Internat...|          9|
|   Monaco|   Circuit de Monaco|          9|
|    Japan|      Suzuka Circuit|          9|
|   Brazil|Autódromo José Ca...|          9|
|      USA|Circuit of the Am...|          8|
|  Austria|       Red Bull Ring|          8|
| Malaysia|Sepang Internatio...|          7|
|   Russia|      Sochi Autodrom|          7|
|  Germany|      Hockenheimring|          5|
|   Mexico