In [0]:
from pyspark.sql.functions import (
    col,
    sum as spark_sum,
)
from pyspark.sql.types import BooleanType
from pyspark.sql.window import Window

In [0]:
# --- Source location --------------------------------------------------------
# The CSV file is stored in a Unity Catalog (UC) Volume; the full path is the
# volume directory followed by the file name.
UC_VOLUME_PATH = '/Volumes/airports_database/default/airports_database/'
CSV_FILE_NAME = 'airports-database.csv'
FINAL_UC_PATH = f"{UC_VOLUME_PATH}{CSV_FILE_NAME}"

# --- Load -------------------------------------------------------------------
# The first line of the file is treated as the header, and column types are
# inferred by Spark from the data rather than declared here.
df_aeroportos = spark.read.csv(FINAL_UC_PATH, header=True, inferSchema=True)

print("\nDataFrame criado com sucesso!")


DataFrame criado com sucesso!


In [0]:
# Preview a few rows of the DataFrame.
# The preview is bounded with limit() so that only a handful of rows is
# requested, instead of handing the whole dataset to display().
display(df_aeroportos.limit(10))

id,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,name
0,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01T05:00:00.000Z,United Air Lines Inc.
1,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01T05:00:00.000Z,United Air Lines Inc.
2,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01T05:00:00.000Z,American Airlines Inc.
3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01T05:00:00.000Z,JetBlue Airways
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01T06:00:00.000Z,Delta Air Lines Inc.
5,2013,1,1,554.0,558,-4.0,740.0,728,12.0,UA,1696,N39463,EWR,ORD,150.0,719,5,58,2013-01-01T05:00:00.000Z,United Air Lines Inc.
6,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,507,N516JB,EWR,FLL,158.0,1065,6,0,2013-01-01T06:00:00.000Z,JetBlue Airways
7,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,5708,N829AS,LGA,IAD,53.0,229,6,0,2013-01-01T06:00:00.000Z,ExpressJet Airlines Inc.
8,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,B6,79,N593JB,JFK,MCO,140.0,944,6,0,2013-01-01T06:00:00.000Z,JetBlue Airways
9,2013,1,1,558.0,600,-2.0,753.0,745,8.0,AA,301,N3ALAA,LGA,ORD,138.0,733,6,0,2013-01-01T06:00:00.000Z,American Airlines Inc.


In [0]:
# Print the DataFrame schema: column names, data types and nullability.
# The types shown are the ones Spark inferred while reading the CSV
# (the reader was configured with inferSchema=true).
df_aeroportos.printSchema()

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- dep_time: double (nullable = true)
 |-- sched_dep_time: integer (nullable = true)
 |-- dep_delay: double (nullable = true)
 |-- arr_time: double (nullable = true)
 |-- sched_arr_time: integer (nullable = true)
 |-- arr_delay: double (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: double (nullable = true)
 |-- distance: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- time_hour: timestamp (nullable = true)
 |-- name: string (nullable = true)



In [0]:
# Size of the dataset: rows require a Spark count, columns come from the
# schema already held by the DataFrame.
num_rows, num_cols = df_aeroportos.count(), len(df_aeroportos.columns)
print(
    f"Número de linhas: {num_rows}",
    f"Número de colunas: {num_cols}",
    sep="\n",
)

Número de linhas: 336776
Número de colunas: 21


In [0]:
# Show descriptive statistics (count, mean, stddev, min, max).
# Called without arguments, describe() reports on the string columns as well as
# the numeric ones; for string columns only count, min and max are filled in.
display(df_aeroportos.describe())

summary,id,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,name
count,336776.0,336776.0,336776.0,336776.0,328521.0,336776.0,328521.0,328063.0,336776.0,327346.0,336776,336776.0,334264,336776,336776,327346.0,336776.0,336776.0,336776.0,336776
mean,168387.5,2013.0,6.548509988835309,15.71078699194717,1349.1099473093043,1344.2548400123524,12.639070257304708,1502.0549985825894,1536.380220086942,6.89537675731489,,1971.9236198541464,,,,150.68646019807787,1039.9126036297123,13.180247404803191,26.23009953203316,
stddev,97219.00146576283,0.0,3.414457244678893,8.768607101536873,488.2817910011616,467.3357557341948,40.2100608921299,533.2641319903768,497.45714151438006,44.63329169019401,,1632.4719381392947,,,,93.68830465900987,733.2330333236599,4.661315707848448,19.30084565741288,
min,0.0,2013.0,1.0,1.0,1.0,106.0,-43.0,1.0,1.0,-86.0,9E,1.0,D942DN,EWR,ABQ,20.0,17.0,1.0,0.0,AirTran Airways Corporation
max,336775.0,2013.0,12.0,31.0,2400.0,2359.0,1301.0,2400.0,2359.0,1272.0,YV,8500.0,N9EAMQ,LGA,XNA,695.0,4983.0,23.0,59.0,Virgin America


In [0]:
# Show the extended statistical summary: the same figures as describe()
# (count, mean, stddev, min, max) plus the 25%, 50% and 75% percentiles.
display(df_aeroportos.summary())

summary,id,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,name
count,336776.0,336776.0,336776.0,336776.0,328521.0,336776.0,328521.0,328063.0,336776.0,327346.0,336776,336776.0,334264,336776,336776,327346.0,336776.0,336776.0,336776.0,336776
mean,168387.5,2013.0,6.548509988835309,15.71078699194717,1349.1099473093043,1344.2548400123524,12.639070257304708,1502.0549985825894,1536.380220086942,6.89537675731489,,1971.9236198541464,,,,150.68646019807787,1039.9126036297123,13.180247404803191,26.23009953203316,
stddev,97219.00146576283,0.0,3.414457244678893,8.768607101536873,488.2817910011616,467.3357557341948,40.2100608921299,533.2641319903768,497.45714151438006,44.63329169019401,,1632.4719381392947,,,,93.68830465900987,733.2330333236599,4.661315707848448,19.30084565741288,
min,0.0,2013.0,1.0,1.0,1.0,106.0,-43.0,1.0,1.0,-86.0,9E,1.0,D942DN,EWR,ABQ,20.0,17.0,1.0,0.0,AirTran Airways Corporation
25%,84170.0,2013.0,4.0,8.0,907.0,906.0,-5.0,1104.0,1124.0,-17.0,,553.0,,,,82.0,502.0,9.0,8.0,
50%,168368.0,2013.0,7.0,16.0,1401.0,1359.0,-2.0,1535.0,1556.0,-5.0,,1496.0,,,,129.0,872.0,13.0,29.0,
75%,252582.0,2013.0,10.0,23.0,1744.0,1729.0,11.0,1940.0,1945.0,14.0,,3464.0,,,,192.0,1389.0,17.0,44.0,
max,336775.0,2013.0,12.0,31.0,2400.0,2359.0,1301.0,2400.0,2359.0,1272.0,YV,8500.0,N9EAMQ,LGA,XNA,695.0,4983.0,23.0,59.0,Virgin America


In [0]:
# Count null values per column in a single aggregation.
# Each column contributes one expression: the null flag is cast to 0/1 and
# summed, and the result keeps the original column name.
null_count_exprs = [
    spark_sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df_aeroportos.columns
]
display(df_aeroportos.agg(*null_count_exprs))

id,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,name
0,0,0,0,8255,0,8255,8713,0,9430,0,0,2512,0,0,9430,0,0,0,0,0


In [0]:
# Number of distinct values per column.
# One select/distinct/count is issued for every column, in schema order.
distinct_counts = {}
for column_name in df_aeroportos.columns:
    distinct_counts[column_name] = (
        df_aeroportos.select(column_name).distinct().count()
    )

print("Contagem de valores distintos por coluna:")
for column_name, n_distinct in distinct_counts.items():
    print(f"{column_name}: {n_distinct}")

Contagem de valores distintos por coluna:
id: 336776
year: 1
month: 12
day: 31
dep_time: 1319
sched_dep_time: 1021
dep_delay: 528
arr_time: 1412
sched_arr_time: 1163
arr_delay: 578
carrier: 16
flight: 3844
tailnum: 4044
origin: 3
dest: 105
air_time: 510
distance: 214
hour: 20
minute: 60
time_hour: 6936
name: 16


In [0]:
# Frequency distribution (top 10 values) of every string-typed column.
# The string columns are picked once from the (name, dtype) pairs, then each
# one is grouped, counted and sorted by descending frequency.
string_columns = [
    column_name
    for column_name, dtype in df_aeroportos.dtypes
    if dtype == 'string'
]
for column_name in string_columns:
    top_values = (
        df_aeroportos
        .groupBy(column_name)
        .count()
        .orderBy('count', ascending=False)
        .limit(10)
    )
    display(top_values)

carrier,count
UA,58665
B6,54635
EV,54173
DL,48110
AA,32729
MQ,26397
US,20536
9E,18460
WN,12275
VX,5162


tailnum,count
,2512
N725MQ,575
N722MQ,513
N723MQ,507
N711MQ,486
N713MQ,483
N258JB,427
N298JB,407
N353JB,404
N351JB,402


origin,count
EWR,120835
JFK,111279
LGA,104662


dest,count
ORD,17283
ATL,17215
LAX,16174
BOS,15508
MCO,14082
CLT,14064
SFO,13331
FLL,12055
MIA,11728
DCA,9705


name,count
United Air Lines Inc.,58665
JetBlue Airways,54635
ExpressJet Airlines Inc.,54173
Delta Air Lines Inc.,48110
American Airlines Inc.,32729
Envoy Air,26397
US Airways Inc.,20536
Endeavor Air Inc.,18460
Southwest Airlines Co.,12275
Virgin America,5162


In [0]:
# Pairwise Pearson correlation between the numeric columns.
#
# DataFrame.dtypes reports Spark SQL type names, so a long column appears as
# 'bigint' (never 'long') and short/byte columns appear as 'smallint' and
# 'tinyint'. Matching on these names keeps such columns from being silently
# left out of the analysis.
NUMERIC_SPARK_TYPES = {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double'}
numeric_cols = [c for c, t in df_aeroportos.dtypes if t in NUMERIC_SPARK_TYPES]

# Every unordered pair is visited exactly once; with fewer than two numeric
# columns the loops simply do not run.
# Note: a column with a single distinct value (here `year`) has zero variance,
# so its correlation with any other column is reported as nan.
for i, left_col in enumerate(numeric_cols):
    for right_col in numeric_cols[i + 1:]:
        corr = df_aeroportos.stat.corr(left_col, right_col)
        print(f"Correlação entre {left_col} e {right_col}: {corr}")

Correlação entre id e year: nan
Correlação entre id e month: 0.07177899178794223
Correlação entre id e day: 0.08785224697881668
Correlação entre id e dep_time: -0.0009705465277637629
Correlação entre id e sched_dep_time: 0.004076341700607097
Correlação entre id e dep_delay: 0.04881879050844889
Correlação entre id e arr_time: -0.02654477085594239
Correlação entre id e sched_arr_time: -0.013372620851013548
Correlação entre id e arr_delay: 0.018090949708827537
Correlação entre id e flight: 0.0026757297945334355
Correlação entre id e air_time: -0.03448370073132029
Correlação entre id e distance: 0.0125263537656721
Correlação entre id e hour: 0.0031503023040879837
Correlação entre id e minute: 0.022618950256378875
Correlação entre year e month: nan
Correlação entre year e day: nan
Correlação entre year e dep_time: nan
Correlação entre year e sched_dep_time: nan
Correlação entre year e dep_delay: nan
Correlação entre year e arr_time: nan
Correlação entre year e sched_arr_time: nan
Correlação

In [0]:
# Print the extended execution plan (parsed, analyzed, optimized logical
# plans and the physical plan) of the deduplicated DataFrame, i.e. of
# df_aeroportos.distinct() followed by a select of every column.
deduplicated = df_aeroportos.distinct()
newDF = deduplicated.select('*')
newDF.explain(extended=True)

== Parsed Logical Plan ==
'Project [*]
+- Deduplicate [id#15232, year#15233, month#15234, day#15235, dep_time#15236, sched_dep_time#15237, dep_delay#15238, arr_time#15239, sched_arr_time#15240, arr_delay#15241, carrier#15242, flight#15243, tailnum#15244, origin#15245, dest#15246, air_time#15247, distance#15248, hour#15249, minute#15250, time_hour#15251, name#15252]
   +- Relation [id#15232,year#15233,month#15234,day#15235,dep_time#15236,sched_dep_time#15237,dep_delay#15238,arr_time#15239,sched_arr_time#15240,arr_delay#15241,carrier#15242,flight#15243,tailnum#15244,origin#15245,dest#15246,air_time#15247,distance#15248,hour#15249,minute#15250,time_hour#15251,name#15252] csv

== Analyzed Logical Plan ==
id: int, year: int, month: int, day: int, dep_time: double, sched_dep_time: int, dep_delay: double, arr_time: double, sched_arr_time: int, arr_delay: double, carrier: string, flight: int, tailnum: string, origin: string, dest: string, air_time: double, distance: int, hour: int, minute: int