Per eseguire operazioni sui DataFrame, bisogna prima di tutto caricare un DataFrame da una sorgente dati. L'interfaccia *DataFrameReader* permette di **leggere** dati da una vasta gamma di formati come JSON, CSV, Parquet, Text, Avro, ORC, ecc.
D'altro canto, per **scrivere** un DataFrame su una sorgente dati in un formato particolare, Spark usa *DataFrameWriter*.

## Lettura del file
Leggiamo un grosso file CSV che contiene i dati sulle chiamate dei pompieri di San Francisco. Prima definiamo lo schema, poi useremo la classe *DataFrameReader* e i suoi metodi per dire a Spark cosa fare.

In [0]:
# importo i data type

from pyspark.sql.types import *

In [0]:
# Build the schema programmatically, one column at a time.
# Columns are declared in the same order as in the CSV header; all are nullable.

fire_schema = (
    StructType()
    .add('CallNumber', IntegerType(), True)
    .add('UnitID', StringType(), True)
    .add('IncidentNumber', IntegerType(), True)
    .add('CallType', StringType(), True)
    .add('CallDate', StringType(), True)
    .add('WatchDate', StringType(), True)
    .add('CallFinalDisposition', StringType(), True)
    .add('AvailableDtTm', StringType(), True)
    .add('Address', StringType(), True)
    .add('City', StringType(), True)
    .add('Zipcode', IntegerType(), True)
    .add('Battalion', StringType(), True)
    .add('StationArea', StringType(), True)
    .add('Box', StringType(), True)
    .add('OriginalPriority', StringType(), True)
    .add('Priority', StringType(), True)
    .add('FinalPriority', IntegerType(), True)
    .add('ALSUnit', BooleanType(), True)
    .add('CallTypeGroup', StringType(), True)
    .add('NumAlarms', IntegerType(), True)
    .add('UnitType', StringType(), True)
    .add('UnitSequenceInCallDispatch', IntegerType(), True)
    .add('FirePreventionDistrict', StringType(), True)
    .add('SupervisorDistrict', StringType(), True)
    .add('Neighborhood', StringType(), True)
    .add('Location', StringType(), True)
    .add('RowID', StringType(), True)
    .add('Delay', FloatType(), True)
)

In [0]:
# List the DBFS entry for the source CSV file (path, name, size, modification time)
dbutils.fs.ls("dbfs:/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")

Out[11]: [FileInfo(path='dbfs:/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv', name='sf-fire-calls.csv', size=1137925359, modificationTime=1576280979000)]

Usiamo l'interfaccia *DataFrameReader* per leggere un file CSV

In [0]:
# Path of the San Francisco fire-calls CSV on DBFS
sf_fire_file = "dbfs:/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv"
# Read the CSV into a DataFrame: the first line is treated as a header and the
# explicit schema is applied instead of letting Spark infer column types
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)
# Display the first 100 rows
display(fire_df.limit(100))
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line
fire_df.printSchema()

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,CallFinalDisposition,AvailableDtTm,Address,City,Zipcode,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumAlarms,UnitType,UnitSequenceInCallDispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhood,Location,RowID,Delay
20110014,M29,2003234,Medical Incident,01/11/2002,01/10/2002,Other,01/11/2002 01:58:43 AM,10TH ST/MARKET ST,SF,94103,B02,36,2338,1,1,2,True,,1,MEDIC,1,2,6,Tenderloin,"(37.7765408927183, -122.417501464907)",020110014-M29,5.233333
20110015,M08,2003233,Medical Incident,01/11/2002,01/10/2002,Other,01/11/2002 02:10:17 AM,300 Block of 5TH ST,SF,94107,B03,8,2243,1,1,2,True,,1,MEDIC,1,3,6,South of Market,"(37.7792841462441, -122.402061300134)",020110015-M08,3.0833333
20110016,B02,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,6,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B02,3.05
20110016,B04,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:51:54 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,3,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-B04,2.3166666
20110016,D2,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,CHIEF,4,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-D2,3.0166667
20110016,E03,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,7,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E03,2.6833334
20110016,E38,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:51:17 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,1,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E38,2.1
20110016,E41,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:47:00 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,ENGINE,8,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-E41,2.7166667
20110016,M03,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:46:38 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,True,,1,MEDIC,10,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-M03,2.7666667
20110016,RS1,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:46:57 AM,2000 Block of CALIFORNIA ST,SF,94109,B04,38,3362,3,3,3,False,,1,RESCUE SQUAD,9,4,5,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-RS1,3.2666667


root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

## Scrittura del DataFrame
Se il DataFrame e' scritto in formato Parquet, lo schema del DataFrame e' preservato come parte dei metadati del file Parquet, quindi non e' necessario (per una successiva lettura dal file Parquet) specificare manualmente lo schema dei dati.

Un'operazione comune e' quella di esplorare e trasformare i dati e poi persistere il DataFrame sotto forma di file Parquet o salvandolo in una tabella SQL.

### File

In [0]:
# Destination directory for the Parquet output
parquetPath = "dbfs:/tmp/learning-spark/fireParquet"
# Save the DataFrame in Parquet format. The default save mode raises an error
# when the target path already exists, so "overwrite" keeps this cell re-runnable.
fire_df.write.format("parquet").mode("overwrite").save(parquetPath)

In [0]:
# Check that the Parquet files were created by listing the output directory
dbutils.fs.ls("dbfs:/tmp/learning-spark/fireParquet")

Out[32]: [FileInfo(path='dbfs:/tmp/learning-spark/fireParquet/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1665654606000),
 FileInfo(path='dbfs:/tmp/learning-spark/fireParquet/_committed_6154992983257719147', name='_committed_6154992983257719147', size=924, modificationTime=1665654606000),
 FileInfo(path='dbfs:/tmp/learning-spark/fireParquet/_started_6154992983257719147', name='_started_6154992983257719147', size=0, modificationTime=1665654491000),
 FileInfo(path='dbfs:/tmp/learning-spark/fireParquet/part-00000-tid-6154992983257719147-eab22e0e-b130-4ada-8d69-ce550d01c81f-230-1-c000.snappy.parquet', name='part-00000-tid-6154992983257719147-eab22e0e-b130-4ada-8d69-ce550d01c81f-230-1-c000.snappy.parquet', size=19293442, modificationTime=1665654598000),
 FileInfo(path='dbfs:/tmp/learning-spark/fireParquet/part-00001-tid-6154992983257719147-eab22e0e-b130-4ada-8d69-ce550d01c81f-231-1-c000.snappy.parquet', name='part-00001-tid-6154992983257719147-eab22e0e-b130-4ada-8d69-ce550d01c81f-2

### Tabella SQL

In [0]:
# Table name
parquet_table = "fire_table"
# Write the DataFrame as a Parquet-backed table; an unqualified name resolves to
# the current database (Databricks' default one unless changed). The default
# save mode raises an error when the table already exists, so "overwrite"
# keeps this cell re-runnable.
fire_df.write.format("parquet").mode("overwrite").saveAsTable(parquet_table)

In [0]:
%sql

-- Preview the first 100 rows of the fire_table table in the default database
SELECT *
FROM default.fire_table
LIMIT 100

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,CallFinalDisposition,AvailableDtTm,Address,City,Zipcode,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumAlarms,UnitType,UnitSequenceInCallDispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhood,Location,RowID,Delay
111050354,E14,11034920,Medical Incident,04/15/2011,04/15/2011,Other,04/15/2011 11:27:08 PM,500 Block of 21ST AVE,SF,94121,B07,14,7171,3,3,3,True,,1,ENGINE,1,7,1,Outer Richmond,"(37.7774255992901, -122.480311994328)",111050354-E14,4.7833333
111050355,E03,11034921,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:10:54 PM,HYDE ST/BUSH ST,SF,94109,B04,3,1561,3,3,3,True,,1,ENGINE,1,4,3,Nob Hill,"(37.7891101748937, -122.417016879226)",111050355-E03,1.9166666
111050355,T03,11034921,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:10:54 PM,HYDE ST/BUSH ST,SF,94109,B04,3,1561,3,3,3,False,,1,TRUCK,2,4,3,Nob Hill,"(37.7891101748937, -122.417016879226)",111050355-T03,2.4333334
111050356,73,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:24:56 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,True,,1,MEDIC,10,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-73,2.0666666
111050356,B06,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:22:46 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,CHIEF,6,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-B06,2.6
111050356,B10,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:25:00 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,CHIEF,4,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-B10,3.25
111050356,D3,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:23:01 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,CHIEF,7,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-D3,3.5
111050356,E29,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:22:50 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,True,,1,ENGINE,8,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-E29,2.6
111050356,E37,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:25:10 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,ENGINE,2,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-E37,2.6666667
111050356,RS2,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:24:11 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,RESCUE SQUAD,5,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-RS2,3.05


## Lettura del file Parquet
Non e' necessario specificare lo schema perche' e' salvato nei metadati del file Parquet.

In [0]:
# Path of the Parquet directory to read back
sf_fire_file = "dbfs:/tmp/learning-spark/fireParquet/"
# Read the Parquet files into a DataFrame; no schema argument is passed because
# Parquet files carry their schema in their own metadata
fire_df = spark.read.parquet(sf_fire_file)
# Display the first 100 rows
display(fire_df.limit(100))
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line
fire_df.printSchema()

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,CallFinalDisposition,AvailableDtTm,Address,City,Zipcode,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumAlarms,UnitType,UnitSequenceInCallDispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhood,Location,RowID,Delay
111050354,E14,11034920,Medical Incident,04/15/2011,04/15/2011,Other,04/15/2011 11:27:08 PM,500 Block of 21ST AVE,SF,94121,B07,14,7171,3,3,3,True,,1,ENGINE,1,7,1,Outer Richmond,"(37.7774255992901, -122.480311994328)",111050354-E14,4.7833333
111050355,E03,11034921,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:10:54 PM,HYDE ST/BUSH ST,SF,94109,B04,3,1561,3,3,3,True,,1,ENGINE,1,4,3,Nob Hill,"(37.7891101748937, -122.417016879226)",111050355-E03,1.9166666
111050355,T03,11034921,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:10:54 PM,HYDE ST/BUSH ST,SF,94109,B04,3,1561,3,3,3,False,,1,TRUCK,2,4,3,Nob Hill,"(37.7891101748937, -122.417016879226)",111050355-T03,2.4333334
111050356,73,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:24:56 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,True,,1,MEDIC,10,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-73,2.0666666
111050356,B06,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:22:46 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,CHIEF,6,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-B06,2.6
111050356,B10,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:25:00 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,CHIEF,4,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-B10,3.25
111050356,D3,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:23:01 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,CHIEF,7,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-D3,3.5
111050356,E29,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:22:50 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,True,,1,ENGINE,8,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-E29,2.6
111050356,E37,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:25:10 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,ENGINE,2,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-E37,2.6666667
111050356,RS2,11034922,Structure Fire,04/15/2011,04/15/2011,Other,04/15/2011 11:24:11 PM,1000 Block of POTRERO AVE,SF,94110,B10,7,2553,3,3,3,False,,1,RESCUE SQUAD,5,10,10,Potrero Hill,"(37.7565080013216, -122.40654101432)",111050356-RS2,3.05


root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 