In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below: reuse the active one if
# it exists, otherwise create it.
ss = (
    SparkSession.builder
    .getOrCreate()
)

In [5]:
# Load the people file: fields are separated by ';', the first line carries
# the column names, and column types are inferred from the values.
df = (
    ss.read
    .options(sep=';', header=True, inferSchema=True)
    .csv('persone.txt')
)
df.show()


+-----+--------+-------+----+-----------+-----+
| nome| cognome|altezza|peso|datanascita|sesso|
+-----+--------+-------+----+-----------+-----+
|Mario|   Rossi|   1.78|  70| 1995-05-01|    M|
|Luigi| Bianchi|   1.67|  71| 1995-06-02|    M|
| John|     Doe|   1.64|  61| 1995-07-03|    M|
| Anna|    Blue|    1.7|  60| 1995-08-04|    F|
|Viola|Scarlett|   1.72|  62| 1995-09-05|    F|
|Maria|   Brown|   1.85|  80| 1996-01-01|    F|
+-----+--------+-------+----+-----------+-----+



In [6]:
# Print the schema as a tree to check the column types that inferSchema
# picked when the file was read.
df.printSchema()

root
 |-- nome: string (nullable = true)
 |-- cognome: string (nullable = true)
 |-- altezza: double (nullable = true)
 |-- peso: integer (nullable = true)
 |-- datanascita: date (nullable = true)
 |-- sesso: string (nullable = true)



In [7]:
# Number of rows in the loaded DataFrame.
df.count()

6

In [8]:
# Keep only the rows whose `sesso` column equals 'F'.
df2 = df.where(df.sesso == 'F')
df2.show()

+-----+--------+-------+----+-----------+-----+
| nome| cognome|altezza|peso|datanascita|sesso|
+-----+--------+-------+----+-----------+-----+
| Anna|    Blue|    1.7|  60| 1995-08-04|    F|
|Viola|Scarlett|   1.72|  62| 1995-09-05|    F|
|Maria|   Brown|   1.85|  80| 1996-01-01|    F|
+-----+--------+-------+----+-----------+-----+



In [9]:
# Count how many rows fall under each distinct value of `sesso`.
df3 = (
    df
    .groupby("sesso")
    .count()
)
df3.show()

+-----+-----+
|sesso|count|
+-----+-----+
|    F|    3|
|    M|    3|
+-----+-----+



In [10]:
# RDD to DataFrame: distribute a small list of tuples as an RDD, then build a
# DataFrame from it by supplying the column names.
sc = ss.sparkContext
dati = [
    ('Mario', 'Rossi', 25),
    ('Luigi', 'Bianchi', 34),
    ('Giuseppe', 'Verdi', 18),
]
rdd = sc.parallelize(dati)
df4 = ss.createDataFrame(rdd, schema=['nome', 'cognome', 'eta'])
df4.show()


+--------+-------+---+
|    nome|cognome|eta|
+--------+-------+---+
|   Mario|  Rossi| 25|
|   Luigi|Bianchi| 34|
|Giuseppe|  Verdi| 18|
+--------+-------+---+



In [11]:
# Print only the first row.
df4.show(n=1)

+-----+-------+---+
| nome|cognome|eta|
+-----+-------+---+
|Mario|  Rossi| 25|
+-----+-------+---+
only showing top 1 row



In [12]:
# Enable eager evaluation so that a bare DataFrame expression at the end of a
# cell is rendered as a table instead of its plain repr.
# The session in this notebook is bound to `ss`; no `spark` name is defined
# here, so going through `ss` keeps the cell working on a fresh kernel.
ss.conf.set('spark.sql.repl.eagerEval.enabled', True)
df4

nome,cognome,eta
Mario,Rossi,25
Luigi,Bianchi,34
Giuseppe,Verdi,18


In [13]:
# Print the first row in vertical layout (one "column | value" line per field).
df4.show(n=1, vertical=True)

-RECORD 0--------
 nome    | Mario 
 cognome | Rossi 
 eta     | 25    
only showing top 1 row



In [14]:
# List of column names, in the order given when df4 was created.
df4.columns

['nome', 'cognome', 'eta']

In [15]:
# Bare expression: rendered as a table because eager evaluation was enabled
# in an earlier cell.
df2

nome,cognome,altezza,peso,datanascita,sesso
Anna,Blue,1.7,60,1995-08-04,F
Viola,Scarlett,1.72,62,1995-09-05,F
Maria,Brown,1.85,80,1996-01-01,F


In [16]:
# Project df2 onto the surname and height columns only.
df2.select(["cognome", "altezza"])

cognome,altezza
Blue,1.7
Scarlett,1.72
Brown,1.85


In [17]:
# Summary statistics (count, mean, stddev, min, max) restricted to the
# surname and height columns; mean/stddev are NULL for the string column.
df2.describe("cognome", "altezza").show()

+-------+--------+-------------------+
|summary| cognome|            altezza|
+-------+--------+-------------------+
|  count|       3|                  3|
|   mean|    NULL| 1.7566666666666666|
| stddev|    NULL|0.08144527815247084|
|    min|    Blue|                1.7|
|    max|Scarlett|               1.85|
+-------+--------+-------------------+



In [18]:
# Bring every row of df2 back as a Python list of Row objects. Acceptable here
# because df2 holds only three rows.
df2.collect()

[Row(nome='Anna', cognome='Blue', altezza=1.7, peso=60, datanascita=datetime.date(1995, 8, 4), sesso='F'),
 Row(nome='Viola', cognome='Scarlett', altezza=1.72, peso=62, datanascita=datetime.date(1995, 9, 5), sesso='F'),
 Row(nome='Maria', cognome='Brown', altezza=1.85, peso=80, datanascita=datetime.date(1996, 1, 1), sesso='F')]

In [20]:
# Return just the first two rows as a list of Row objects.
df2.take(2)

[Row(nome='Anna', cognome='Blue', altezza=1.7, peso=60, datanascita=datetime.date(1995, 8, 4), sesso='F'),
 Row(nome='Viola', cognome='Scarlett', altezza=1.72, peso=62, datanascita=datetime.date(1995, 9, 5), sesso='F')]

In [21]:
# Convert df2 to a pandas DataFrame (note the added integer index in the output).
df2.toPandas()

Unnamed: 0,nome,cognome,altezza,peso,datanascita,sesso
0,Anna,Blue,1.7,60,1995-08-04,F
1,Viola,Scarlett,1.72,62,1995-09-05,F
2,Maria,Brown,1.85,80,1996-01-01,F
