In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("parquet_files").getOrCreate()

In [2]:
spark

In [4]:
drivers_df = spark.read.csv("F1_Complete_Dataset/drivers.csv", header = True, inferSchema = True)

drivers_df.printSchema()
drivers_df.show(n = 5,truncate = False)

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)

+--------+----------+------+----+--------+----------+----------+-----------+----------------------------------------------+
|driverId|driverRef |number|code|forename|surname   |dob       |nationality|url                                           |
+--------+----------+------+----+--------+----------+----------+-----------+----------------------------------------------+
|1       |hamilton  |44    |HAM |Lewis   |Hamilton  |1985-01-07|British    |http://en.wikipedia.org/wiki/Lewis_Hamilton   |
|2       |heidfeld  |\N    |HEI |Nick    |Heidfeld  |1977-05-10|German     |http://en.wikipedia.org/wiki/Nick_Heidfeld    |
|3       |rosberg   |6     |

In [5]:
drivers_df.write.mode("overwrite").parquet("output/drivers_parquet")

In [6]:
drivers_df.repartition(4).write.mode("overwrite").parquet("output/drivers_parquet")

In [7]:
driver_df.write.mode("overwrite").partitionBy("nationality").parquet("output/drivers_partitioned")

In [8]:
drivers_df_parquet = spark.read.parquet("output/drivers_parquet")
drivers_df_parquet.printSchema()
drivers_df_parquet.show(n = 5)

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: string (nullable = true)
 |-- code: string (nullable = true)
 |-- forename: string (nullable = true)
 |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)

+--------+-----------------+------+----+--------+---------+----------+-------------+--------------------+
|driverId|        driverRef|number|code|forename|  surname|       dob|  nationality|                 url|
+--------+-----------------+------+----+--------+---------+----------+-------------+--------------------+
|     408|            maggs|    \N|  \N|    Tony|    Maggs|1937-02-09|South African|http://en.wikiped...|
|     443|ernesto_brambilla|    \N|  \N| Ernesto|Brambilla|1934-01-31|      Italian|http://en.wikiped...|
|     301|           morgan|    \N|  \N|    Dave|   Morgan|1944-08-07|      British|http://en.wikiped...|
|     591|  

In [9]:
driver_df_parquet = spark.read.parquet("output/drivers_partitioned/")
drivers_df_parquet.orderBy(drivers_df_parquet.nationality.desc()).show(n=10)

+--------+--------------+------+----+---------+---------+----------+-----------+--------------------+
|driverId|     driverRef|number|code| forename|  surname|       dob|nationality|                 url|
+--------+--------------+------+----+---------+---------+----------+-----------+--------------------+
|     188|       cecotto|    \N|  \N|   Johnny|  Cecotto|1956-01-25| Venezuelan|http://en.wikiped...|
|     503|       chimeri|    \N|  \N|   Ettore|  Chimeri|1921-06-04| Venezuelan|http://en.wikiped...|
|     813|     maldonado|    13| MAL|   Pastor|Maldonado|1985-03-09| Venezuelan|http://en.wikiped...|
|     568|        fontes|    \N|  \N| Azdrubal|   Fontes|1922-12-26|  Uruguayan|http://en.wikiped...|
|     623|          uria|    \N|  \N|  Alberto|     Uria|1924-07-11|  Uruguayan|http://en.wikiped...|
|     748|       cantoni|    \N|  \N|    Eitel|  Cantoni|1906-10-04|  Uruguayan|http://en.wikiped...|
|     806|oscar_gonzalez|    \N|  \N|    Óscar| González|1923-11-10|  Uruguayan|ht

In [12]:
drivers_df_parquet = spark.read.parquet("output/drivers_partitioned/nationality=Venezuelan")
drivers_df_parquet.show()

+--------+---------+------+----+--------+---------+----------+--------------------+
|driverId|driverRef|number|code|forename|  surname|       dob|                 url|
+--------+---------+------+----+--------+---------+----------+--------------------+
|     188|  cecotto|    \N|  \N|  Johnny|  Cecotto|1956-01-25|http://en.wikiped...|
|     503|  chimeri|    \N|  \N|  Ettore|  Chimeri|1921-06-04|http://en.wikiped...|
|     813|maldonado|    13| MAL|  Pastor|Maldonado|1985-03-09|http://en.wikiped...|
+--------+---------+------+----+--------+---------+----------+--------------------+



In [13]:
df = spark.read.json("json_files/students.json")
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 22|  1|  Alice|
| 23|  2|    Bob|
| 21|  3|Charlie|
+---+---+-------+



In [14]:
df = spark.read.json("json_files/students_city.json")
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+----+------+---+-------+
| age|  city| id|   name|
+----+------+---+-------+
|  22|  NULL|  1|  Alice|
|  23|  NULL|  2|    Bob|
|  21|  NULL|  3|Charlie|
|NULL|  Pune|  4|  David|
|NULL|Mumbai|  5|    Eva|
+----+------+---+-------+



In [15]:
from pyspark.sql.types import StructType, StructField, IntegerType,StringType
schema = StructType([
    StructField("id", IntegerType(),True),
    StructField("name",StringType(), True),
    StructField("age",IntegerType(),True)
])

students_df = spark.read.schema(schema).json("json_files/students_city.json")

students_df.printSchema()
students_df.show()


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+-------+----+
| id|   name| age|
+---+-------+----+
|  1|  Alice|  22|
|  2|    Bob|  23|
|  3|Charlie|  21|
|  4|  David|NULL|
|  5|    Eva|NULL|
+---+-------+----+



In [16]:
df = spark.read.json("json_files/students_nested.json")
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first: string (nullable = true)
 |    |-- last: string (nullable = true)



In [17]:
df.show()

+---+---+---------------+
|age| id|           name|
+---+---+---------------+
| 22|  1| {Alice, Smith}|
| 23|  2| {Bob, Johnson}|
| 34|  3|{Charlie, Pitt}|
+---+---+---------------+



In [19]:
df.select("id", "name.first", "name.last","age").show()

+---+-------+-------+---+
| id|  first|   last|age|
+---+-------+-------+---+
|  1|  Alice|  Smith| 22|
|  2|    Bob|Johnson| 23|
|  3|Charlie|   Pitt| 34|
+---+-------+-------+---+



In [21]:
from pyspark.sql.types import StructType, StructField,IntegerType,StringType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name",StructType([
                                StructField("first", StringType(), True),
                                StructField("last", StringType(), True)
    ])),
    StructField("age",IntegerType(), True)
])

students_df = spark.read.schema(schema).json("json_files/students_nested.json")

students_df.printSchema()
students_df.show()

root
 |-- id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first: string (nullable = true)
 |    |-- last: string (nullable = true)
 |-- age: integer (nullable = true)

+---+---------------+---+
| id|           name|age|
+---+---------------+---+
|  1| {Alice, Smith}| 22|
|  2| {Bob, Johnson}| 23|
|  3|{Charlie, Pitt}| 34|
+---+---------------+---+



In [22]:
from pyspark.sql.functions import col
flatten_df = students_df.select(
    col("id"),
    col("name.first").alias("first_name"),
    col("name.last").alias("last_name"),
    col("age")
)

flatten_df.printSchema()
flatten_df.show(truncate = False)

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+----------+---------+---+
|id |first_name|last_name|age|
+---+----------+---------+---+
|1  |Alice     |Smith    |22 |
|2  |Bob       |Johnson  |23 |
|3  |Charlie   |Pitt     |34 |
+---+----------+---------+---+



In [24]:
name_schema = StructType([
    StructField("first", StringType(),True),
    StructField("last", StringType(), True)
])

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", name_schema),
    StructField("age",IntegerType(),True)
])

students_df = spark.read.schema(schema).json("json_files/students_nested.json")
students_df.printSchema()
students_df.show()

root
 |-- id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- first: string (nullable = true)
 |    |-- last: string (nullable = true)
 |-- age: integer (nullable = true)

+---+---------------+---+
| id|           name|age|
+---+---------------+---+
|  1| {Alice, Smith}| 22|
|  2| {Bob, Johnson}| 23|
|  3|{Charlie, Pitt}| 34|
+---+---------------+---+



In [25]:
df = spark.read.json("json_files/students_multiline.json")
df.printSchema()
df.show()

root
 |-- _corrupt_record: string (nullable = true)
 |-- age: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---------------+----+----+-------+
|_corrupt_record| age|  id|   name|
+---------------+----+----+-------+
|              [|NULL|NULL|   NULL|
|           NULL|  22|   1|  Alice|
|           NULL|  23|   2|    Bob|
|           NULL|  21|   3|Charlie|
|              ]|NULL|NULL|   NULL|
+---------------+----+----+-------+



In [26]:
df = spark.read.option("multiline", "true").json("json_files/students_multiline.json")
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 22|  1|  Alice|
| 23|  2|    Bob|
| 21|  3|Charlie|
+---+---+-------+



In [28]:
df = spark.read.options(header = 'true', inferSchema = 'true').csv('F1_Complete_Dataset/races.csv')
df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------+--------+--------+--------+--------+--------+----------+----------+-----------+-----------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|fp1_date|fp1_time|fp2_date|fp2_time|fp3_date|fp3_time|quali_date|quali_time|sprint_date|sprint_time|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------+--------+--------+--------+--------+--------+----------+----------+-----------+-----------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|      \N|      \N|      \N|      \N|      \N|      \N|        \N|        \N|         \N|         \N|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|      \N|      \N|      \N|      \N|      \N|      \N|        \N|        \N|         \N|         \N|
|     3|2009|    3|       17|  Chinese G

In [29]:
df.write.json("output/races_json")

In [30]:
df.write.mode("overwrite").json("output/races_json")

In [31]:
df.write.partitionBy("year").mode("overwrite").json("output/races_partitioned")

In [32]:
df.select("raceId","year","name").write.json("output/races_partial_json")