In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor as SparkDecisionTreeRegressor
from pyspark.sql.functions import col, format_number, sum as spark_sum
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [0]:
# Location of the source CSV inside the UC volume.
UC_VOLUME_PATH = '/Volumes/airports_database/default/airports_database/'
CSV_FILE_NAME = 'airports-database.csv'
FINAL_UC_PATH = f"{UC_VOLUME_PATH}{CSV_FILE_NAME}"

# Read the CSV into a Spark DataFrame: the first row is treated as the header
# and column types are inferred from the data.
df_aeroportos = spark.read.csv(FINAL_UC_PATH, header=True, inferSchema=True)

print("\nDataFrame criado com sucesso!")


DataFrame criado com sucesso!


In [0]:
# Keep only the label (arr_delay) and the delay/time columns of interest.
colunas_selecionadas = [
    "arr_delay",
    "dep_delay",
    "sched_dep_time",
    "dep_time",
    "sched_arr_time",
    "arr_time",
    "hour",
]
df_aeroportos_features = df_aeroportos.select(*colunas_selecionadas)

In [0]:
def checa_valores_nulos(df):
    """Display a one-row table with the number of null values per column of ``df``.

    Args:
        df: Spark DataFrame to inspect; every column in ``df.columns`` is checked.
    """
    contagens_nulos = []
    for nome_coluna in df.columns:
        # isNull() yields a boolean; casting to int (0/1) lets the sum count nulls.
        indicador_nulo = col(nome_coluna).isNull().cast("int")
        contagens_nulos.append(spark_sum(indicador_nulo).alias(nome_coluna))
    display(df.agg(*contagens_nulos))


checa_valores_nulos(df_aeroportos_features)

arr_delay,dep_delay,sched_dep_time,dep_time,sched_arr_time,arr_time,hour
9430,8255,0,8255,0,8713,0


In [0]:
# Handle missing values: discard every row that has a null in any column,
# then show the schema of the cleaned DataFrame.
df_aeroportos_features = df_aeroportos_features.na.drop(how="any")

df_aeroportos_features.printSchema()

root
 |-- arr_delay: double (nullable = true)
 |-- dep_delay: double (nullable = true)
 |-- sched_dep_time: integer (nullable = true)
 |-- dep_time: double (nullable = true)
 |-- sched_arr_time: integer (nullable = true)
 |-- arr_time: double (nullable = true)
 |-- hour: integer (nullable = true)



In [0]:
# Re-run the null count on the DataFrame after the rows with nulls were dropped.
checa_valores_nulos(df_aeroportos_features)

arr_delay,dep_delay,sched_dep_time,dep_time,sched_arr_time,arr_time,hour
0,0,0,0,0,0,0


In [0]:
# Preview the cleaned label/feature columns.
display(df_aeroportos_features)

arr_delay,dep_delay,sched_dep_time,dep_time,sched_arr_time,arr_time,hour
11.0,2.0,515,517.0,819,830.0,5
20.0,4.0,529,533.0,830,850.0,5
33.0,2.0,540,542.0,850,923.0,5
-18.0,-1.0,545,544.0,1022,1004.0,5
-25.0,-6.0,600,554.0,837,812.0,6
12.0,-4.0,558,554.0,728,740.0,5
19.0,-5.0,600,555.0,854,913.0,6
-14.0,-3.0,600,557.0,723,709.0,6
-8.0,-3.0,600,557.0,846,838.0,6
8.0,-2.0,600,558.0,745,753.0,6


In [0]:
# Every column except the label (arr_delay) is used as a model feature.
# NOTE(review): in the previewed rows arr_delay matches arr_time minus
# sched_arr_time, so these two features may leak the label — confirm intent.
feature_cols = [nome for nome in df_aeroportos_features.columns if nome != 'arr_delay']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_montado = assembler.transform(df_aeroportos_features)
df_features = df_montado.select("features", "arr_delay")

# 80/20 train/test split with a fixed seed.
train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

# Configure the decision-tree regressor and fit it on the training split.
modelo_arvore_decisao = SparkDecisionTreeRegressor(featuresCol="features", labelCol="arr_delay")
modelo_treinado = modelo_arvore_decisao.fit(train_data)

# Generate predictions for the test split.
predicoes = modelo_treinado.transform(test_data)

# Both evaluators compare the same label/prediction columns and differ only
# in the metric they report.
colunas_avaliacao = {"labelCol": "arr_delay", "predictionCol": "prediction"}
evaluator = RegressionEvaluator(metricName="mse", **colunas_avaliacao)
evaluator_r2 = RegressionEvaluator(metricName="r2", **colunas_avaliacao)

# Mean squared error on the test split.
erro_quadratico_arvore_decisao = evaluator.evaluate(predicoes)
print(erro_quadratico_arvore_decisao)

# R² on the test split.
r2_arvore_decisao = evaluator_r2.evaluate(predicoes)
print(r2_arvore_decisao)

479.00670037991375
0.7708071944169319


In [0]:
# Persist the fitted Spark MLlib model. Spark MLlib models are saved through
# their own writer (pickle is the approach for scikit-learn models, not these).
#
# A plain .save() raises when the target directory already exists, which made
# this cell fail on every re-run of the notebook. write().overwrite() replaces
# any previously saved copy instead.
MODEL_DIR = f"{UC_VOLUME_PATH.rstrip('/')}/model_decision_tree"
modelo_treinado.write().overwrite().save(MODEL_DIR)

print(f"✅ Modelo model_decision_tree salvo com sucesso em {MODEL_DIR}!")

✅ Modelo model_decision_tree salvo com sucesso em /Volumes/airports_database/default/airports_database/model/model_decision_tree!


In [0]:
from pyspark.sql import Row

# Five hand-written example flights (adjust the values as needed).
# Each Row carries the same feature columns the model was trained on.
voos = [
    Row(dep_delay=5, sched_dep_time=830, dep_time=835, sched_arr_time=1030, arr_time=1035, hour=8),
    Row(dep_delay=0, sched_dep_time=900, dep_time=900, sched_arr_time=1100, arr_time=1100, hour=9),
    Row(dep_delay=15, sched_dep_time=700, dep_time=715, sched_arr_time=900, arr_time=915, hour=7),
    Row(dep_delay=30, sched_dep_time=1200, dep_time=1230, sched_arr_time=1400, arr_time=1430, hour=12),
    Row(dep_delay=-3, sched_dep_time=600, dep_time=557, sched_arr_time=800, arr_time=757, hour=6)
]

df_voos = spark.createDataFrame(voos)

# Build the feature vector with the same assembler used for training, so each
# value lands in the vector position the model expects. Deriving the input
# columns from df_voos.columns instead ties the order to how Row orders its
# keyword fields (sorted alphabetically before Spark 3.0), which can silently
# diverge from the training order and produce meaningless predictions.
df_voos_features = assembler.transform(df_voos)

# Apply the trained model to the example flights.
predicoes_voos = modelo_treinado.transform(df_voos_features)

display(predicoes_voos.select("features", "prediction"))

