### Schemas:
Definire uno schema prima di leggere i dati in un DataFrame offre 3 vantaggi:
- Evitiamo che Spark cerchi di indovinare i data type
- Evitiamo che venga creato un *job* separato per fare la deduzione dello schema (si può perdere parecchio tempo)
- Possiamo individuare prima gli errori nel caso in cui i dati non corrispondano allo schema

Spark permette di definire lo schema in 2 modi:
- Programmaticamente: costruire la struttura attraverso l'uso di data type complessi
- Utilizzo del DDL (Data Definition Language)

# Esempio 1

In [0]:
from datetime import date

# Sample rows for Example 1; each row is [personaggio, citta, episodi, data].
data = [
    ["pippo", "topolinia", 200, date(1920, 1, 1)],
    ["paperino", "paperopoli", 200, date(1920, 1, 1)],
    ["topolino", "topolinia", 200, date(1920, 1, 1)],
    ["paperoga", "paperopoli", 200, date(1920, 1, 1)],
]

In [0]:
# Import the data types needed to build the schema.
# NOTE(review): the wildcard import is kept unchanged because other cells may rely
# on names it brings into scope; explicit imports would be preferable.

from pyspark.sql.types import *

# StructType(...)                         -> structure of the dataset
# [...]                                   -> list of the fields it contains
# StructField(name, data type, nullable)  -> definition of a single field

schema_progr = StructType(
    [
        StructField("personaggio", StringType(), nullable=False),
        StructField("citta", StringType(), nullable=False),
        StructField("episodi", IntegerType(), nullable=False),
        StructField("data", DateType(), nullable=False),
    ]
)

In [0]:
# Build the DataFrame from the in-memory rows using the programmatic schema.
disney_df_schema_progr = spark.createDataFrame(data, schema_progr)

# Display the rows.
disney_df_schema_progr.show()

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly: wrapping it in print() only appends a stray "None" line.
disney_df_schema_progr.printSchema()

+-----------+----------+-------+----------+
|personaggio|     citta|episodi|      data|
+-----------+----------+-------+----------+
|      pippo| topolinia|    200|1920-01-01|
|   paperino|paperopoli|    200|1920-01-01|
|   topolino| topolinia|    200|1920-01-01|
|   paperoga|paperopoli|    200|1920-01-01|
+-----------+----------+-------+----------+

root
 |-- personaggio: string (nullable = false)
 |-- citta: string (nullable = false)
 |-- episodi: integer (nullable = false)
 |-- data: date (nullable = false)

None


In [0]:
# Same columns as the programmatic schema, expressed as a DDL string
# (adjacent literals are concatenated into one "name type, ..." string).
schema_DDL = (
    "personaggio string, "
    "citta string, "
    "episodi int, "
    "data date"
)

In [0]:
# Build the DataFrame from the in-memory rows using the DDL-string schema.
disney_df_schema_ddl = spark.createDataFrame(data, schema_DDL)

# Display the rows.
disney_df_schema_ddl.show()

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly: wrapping it in print() only appends a stray "None" line.
disney_df_schema_ddl.printSchema()

+-----------+----------+-------+----------+
|personaggio|     citta|episodi|      data|
+-----------+----------+-------+----------+
|      pippo| topolinia|    200|1920-01-01|
|   paperino|paperopoli|    200|1920-01-01|
|   topolino| topolinia|    200|1920-01-01|
|   paperoga|paperopoli|    200|1920-01-01|
+-----------+----------+-------+----------+

root
 |-- personaggio: string (nullable = true)
 |-- citta: string (nullable = true)
 |-- episodi: integer (nullable = true)
 |-- data: date (nullable = true)

None


# Esempio 2

In [0]:
# Sample rows for Example 2; each row is
# [Id, First, Last, Url, Published, Hits, Campaigns].
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

In [0]:
# DDL-string schema for the blog rows, one column per line
# (adjacent literals are concatenated into a single string).
schema_ddl = (
    "`Id` INT, "
    "`First` STRING, "
    "`Last` STRING, "
    "`Url` STRING, "
    "`Published` STRING, "
    "`Hits` INT, "
    "`Campaigns` ARRAY<STRING>"
)

In [0]:
# Create the DataFrame from the in-memory rows using the DDL-string schema.
blogs_df_schema_ddl = spark.createDataFrame(data, schema_ddl)

# Show the DataFrame; truncate=False displays every element of the array column in full.
blogs_df_schema_ddl.show(truncate=False)

# Print the DataFrame schema. printSchema() writes to stdout itself and returns
# None, so it is called directly: wrapping it in print() only appends "None".
blogs_df_schema_ddl.printSchema()

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string

In [0]:
# Programmatic schema for the blog rows: one StructField per column,
# with the last column holding an array of strings.
schema_progr = StructType(
    [
        StructField("Id", IntegerType(), nullable=True),
        StructField("First", StringType(), nullable=True),
        StructField("Last", StringType(), nullable=True),
        StructField("Url", StringType(), nullable=True),
        StructField("Published", StringType(), nullable=True),
        StructField("Hits", IntegerType(), nullable=True),
        StructField("Campaigns", ArrayType(StringType()), nullable=True),
    ]
)

In [0]:
# Create the DataFrame from the in-memory rows using the programmatic schema.
blogs_df_schema_progr = spark.createDataFrame(data, schema_progr)

# Show the DataFrame; truncate=False displays every element of the array column in full.
blogs_df_schema_progr.show(truncate=False)

# Print the DataFrame schema. printSchema() writes to stdout itself and returns
# None, so it is called directly: wrapping it in print() only appends "None".
blogs_df_schema_progr.printSchema()

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string